From ee57a5ea79dcce6a438ecd2042d3ca7596c510a5 Mon Sep 17 00:00:00 2001 From: Takuya Matsuda Date: Wed, 5 May 2021 04:42:49 +0900 Subject: [PATCH] add: ICLR2021 data I executed your notebook and made iclr2021.json. Also, I added a new line to *.html file to include ICLR 2021 and changed the default conference year in js/index.js. --- .DS_Store | Bin 0 -> 6148 bytes about.html | 1 + charts.html | 1 + data/.DS_Store | Bin 0 -> 6148 bytes data/iclr2021.json | 95130 +++++++++++++++++++++++++++++++++++++++++++ index.html | 1 + js/index.js | 2 +- 7 files changed, 95134 insertions(+), 1 deletion(-) create mode 100644 .DS_Store create mode 100644 data/.DS_Store create mode 100644 data/iclr2021.json diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..91ae7b26c0bec65635fac45f24b1e95c84f8d28f GIT binary patch literal 6148 zcmeHK%Wl&^6upxKuw7MTfkas#dBZM>k_c6>s1(w4(FFn#1Pef|9b1i&>xp6~p@g99 z`2tw*3w#G3fcO?xaOOc2JI$sGgrd39%(a0jDsdom53}PffWbYB_`@c zTCce#F#W8U{;?xS984v&!P zzoo$=+MzyHxzD-9qKp# z?hoTA%^HoLtW>UCyMDv69LL#p-t-21+RMCboOZm?EAjNy%LcyopK~5Beb0Xph0~Mj ztwWwE-oE$@M*m+J#w7$wI(c5CXr5~5YY^(fIfP(aQ=m}G8OpFq}- zrsQG>`k3Dy>cr5TVOSzD)8YGhnahU-32n4=gw@ui$G|#H(jd=}EAi{fN<;%!G(Fj@ zoh$MU%_v|L*j@$1{@@}BY-?O7lv@Wfc?AGgP%I6;_+1bjS7Te_LLq8kOi_V~%Jdb3 zDLTqs9oN>lP^jp{^yP!;o|(R(FxflWcV#%SwnCE|1&jiD1jC(c?I=@dzrh+8O>6lD51mX0`z_mHHa&J_-@t#P3cB{1_N NAZ0L_QQ)5{@CP|53)KJs literal 0 HcmV?d00001 diff --git a/about.html b/about.html index b109ea4..eea5a34 100644 --- a/about.html +++ b/about.html @@ -15,6 +15,7 @@

ICLR Open Review Papers

ICLR 2018 ICLR 2019 ICLR 2020 + ICLR 2021

Horace He

Cornell 2020

diff --git a/charts.html b/charts.html index 8989ddd..50894b9 100644 --- a/charts.html +++ b/charts.html @@ -15,6 +15,7 @@

ICLR 2018 Open Review Papers

ICLR 2018 ICLR 2019 ICLR 2020 + ICLR 2021
diff --git a/data/.DS_Store b/data/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..85064f8826603337727e3ad2e40555c7802c9f9f GIT binary patch literal 6148 zcmeHKOHRWu5Pfd5C?KRt2#F0+PEa*nK)Va2ELd{@pdTSnX%K~d_S}Iha2gK73f|Zj zN*#>40)%F&ndjJk_q+Psb4PPTXV{>$HgCzQ40)I^bd3HB2#U&o-mFKsZjOWFet@3Jkp+CP{V+q*CH71zj9zArC;V{ZPV(-p3 z%zl(9V^*2F!I*vCL4jNTW?1mNlAzz=6Xh%uYT0dpr9d4O}O zmoav65X)lT*3ObKPvk#rT&Kfvu+OI0WJzK1`xJg2H~V0bC0D%w$O~D5*^j}BZhHw)~CK- z?y+>}=rI29Vf@I(pHPe)o%vJS9VU0EQ58@HRuyQv%eL(QC-L+DDoL+Y0af5%DPWS( z*=Wcq`MtF^IoWGd`UPD~`jrkXg~4yfwjf*a9^D$-QYjE~kEKJ_(DXyV%Ai3N_)!Hu E0G~&tGXMYp literal 0 HcmV?d00001 diff --git a/data/iclr2021.json b/data/iclr2021.json new file mode 100644 index 0000000..bb23a9d --- /dev/null +++ b/data/iclr2021.json @@ -0,0 +1,95130 @@ +[ + { + "url": "https://openreview.net/forum?id=UH-cmocLJC", + "decision": "Accept (Oral)", + "ratings": [ + 9, + 9, + 9, + 8 + ], + "rating": "8.73", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "We study how neural networks trained by gradient descent extrapolate, i.e., what they learn outside the support of the training distribution. Previous works report mixed empirical results when extrapolating with neural networks: while feedforward neural networks, a.k.a. multilayer perceptrons (MLPs), do not extrapolate well in certain simple tasks, Graph Neural Networks (GNNs) -- structured networks with MLP modules -- have shown some success in more complex tasks. Working towards a theoretical explanation, we identify conditions under which MLPs and GNNs extrapolate well. First, we quantify the observation that ReLU MLPs quickly converge to linear functions along any direction from the origin, which implies that ReLU MLPs do not extrapolate most nonlinear functions. But, they can provably learn a linear target function when the training distribution is sufficiently diverse. 
Second, in connection to analyzing the successes and limitations of GNNs, these results suggest a hypothesis for which we provide theoretical and empirical evidence: the success of GNNs in extrapolating algorithmic tasks to new data (e.g., larger graphs or edge weights) relies on encoding task-specific non-linearities in the architecture or features. Our theoretical analysis builds on a connection of over-parameterized networks to the neural tangent kernel. Empirically, our theory holds across different training settings.", + "title": "How Neural Networks Extrapolate: From Feedforward to Graph Neural Networks", + "authors": [ + "Keyulu Xu", + "Mozhi Zhang", + "Jingling Li", + "Simon Shaolei Du", + "Ken-Ichi Kawarabayashi", + "Stefanie Jegelka" + ], + "emails": [ + "~Stefanie_Jegelka3", + "~Keyulu_Xu1", + "~Jingling_Li1", + "~Ken-Ichi_Kawarabayashi1", + "~Mozhi_Zhang1", + "~Simon_Shaolei_Du1" + ], + "rank": 1 + }, + { + "url": "https://openreview.net/forum?id=mSAKhLYLSsl", + "decision": "Accept (Oral)", + "ratings": [ + 8, + 9, + 8 + ], + "rating": "8.40", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "As the state-of-the-art machine learning methods in many fields rely on larger datasets, storing datasets and training models on them become significantly more expensive. This paper proposes a training set synthesis technique for data-efficient learning, called Dataset Condensation, that learns to condense large dataset into a small set of informative synthetic samples for training deep neural networks from scratch. We formulate this goal as a gradient matching problem between the gradients of deep neural network weights that are trained on the original and our synthetic data. We rigorously evaluate its performance in several computer vision benchmarks and demonstrate that it significantly outperforms the state-of-the-art methods. 
Finally we explore the use of our method in continual learning and neural architecture search and report promising gains when limited memory and computations are available.", + "title": "Dataset Condensation with Gradient Matching", + "authors": [ + "Bo Zhao", + "Konda Reddy Mopuri", + "Hakan Bilen" + ], + "emails": [ + "~Bo_Zhao4", + "~Konda_Reddy_Mopuri3", + "~Hakan_Bilen1" + ], + "rank": 2 + }, + { + "url": "https://openreview.net/forum?id=EbIDjBynYJ8", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 9, + 8, + 9 + ], + "rating": "8.29", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Disentangling the underlying generative factors from complex data has so far been limited to carefully constructed scenarios. We propose a path towards natural data by first showing that the statistics of natural data provide enough structure to enable disentanglement, both theoretically and empirically. Specifically, we provide evidence that objects in natural movies undergo transitions that are typically small in magnitude with occasional large jumps, which is characteristic of a temporally sparse distribution. To address this finding we provide a novel proof that relies on a sparse prior on temporally adjacent observations to recover the true latent variables up to permutations and sign flips, directly providing a stronger result than previous work. We show that equipping practical estimation methods with our prior often surpasses the current state-of-the-art on several established benchmark datasets without any impractical assumptions, such as knowledge of the number of changing generative factors. Furthermore, we contribute two new benchmarks, Natural Sprites and KITTI Masks, which integrate the measured natural dynamics to enable disentanglement evaluation with more realistic datasets. We leverage these benchmarks to test our theory, demonstrating improved performance. 
We also identify non-obvious challenges for current methods in scaling to more natural domains. Taken together our work addresses key issues in disentanglement research for moving towards more natural settings. ", + "title": "Towards Nonlinear Disentanglement in Natural Data with Temporal Sparse Coding", + "authors": [ + "David A. Klindt", + "Lukas Schott", + "Yash Sharma", + "Ivan Ustyuzhaninov", + "Wieland Brendel", + "Matthias Bethge", + "Dylan Paiton" + ], + "emails": [ + "~Wieland_Brendel1", + "~Lukas_Schott2", + "~Matthias_Bethge1", + "~David_A._Klindt1", + "~Ivan_Ustyuzhaninov1", + "gmail.com", + "~Yash_Sharma1" + ], + "rank": 3 + }, + { + "url": "https://openreview.net/forum?id=UuchYL8wSZo", + "decision": "Accept (Oral)", + "ratings": [ + 9, + 8, + 8, + 8 + ], + "rating": "8.21", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "A growing body of research suggests that embodied gameplay, prevalent not just in human cultures but across a variety of animal species including turtles and ravens, is critical in developing the neural flexibility for creative problem solving, decision making, and socialization. Comparatively little is known regarding the impact of embodied gameplay upon artificial agents. While recent work has produced agents proficient in abstract games, these environments are far removed the real world and thus these agents can provide little insight into the advantages of embodied play. Hiding games, such as hide-and-seek, played universally, provide a rich ground for studying the impact of embodied gameplay on representation learning in the context of perspective taking, secret keeping, and false belief understanding. Here we are the first to show that embodied adversarial reinforcement learning agents playing Cache, a variant of hide-and-seek, in a high fidelity, interactive, environment, learn generalizable representations of their observations encoding information such as object permanence, free space, and containment. 
Moving closer to biologically motivated learning strategies, our agents' representations, enhanced by intentionality and memory, are developed through interaction and play. These results serve as a model for studying how facets of vision develop through interaction, provide an experimental framework for assessing what is learned by artificial agents, and demonstrates the value of moving from large, static, datasets towards experiential, interactive, representation learning.", + "title": "Learning Generalizable Visual Representations via Interactive Gameplay", + "authors": [ + "Luca Weihs", + "Aniruddha Kembhavi", + "Kiana Ehsani", + "Sarah M Pratt", + "Winson Han", + "Alvaro Herrasti", + "Eric Kolve", + "Dustin Schwenk", + "Roozbeh Mottaghi", + "Ali Farhadi" + ], + "emails": [ + "~Eric_Kolve1", + "~Roozbeh_Mottaghi1", + "~Sarah_M_Pratt1", + "~Kiana_Ehsani1", + "~Luca_Weihs1", + "allenai.org", + "~Aniruddha_Kembhavi1", + "~Ali_Farhadi3" + ], + "rank": 4 + }, + { + "url": "https://openreview.net/forum?id=QIRlze3I6hX", + "decision": "Accept (Oral)", + "ratings": [ + 6, + 8, + 7, + 10 + ], + "rating": "8.08", + "confidences": [ + 2, + 3, + 3, + 4 + ], + "abstract": "At the heart of many robotics problems is the challenge of learning correspondences across domains. For instance, imitation learning requires obtaining correspondence between humans and robots; sim-to-real requires correspondence between physics simulators and real hardware; transfer learning requires correspondences between different robot environments. In this paper, we propose to learn correspondence across such domains emphasizing on differing modalities (vision and internal state), physics parameters (mass and friction), and morphologies (number of limbs). Importantly, correspondences are learned using unpaired and randomly collected data from the two domains. We propose dynamics cycles that align dynamic robotic behavior across two domains using a cycle consistency constraint. 
Once this correspondence is found, we can directly transfer the policy trained on one domain to the other, without needing any additional fine-tuning on the second domain. We perform experiments across a variety of problem domains, both in simulation and on real robots. Our framework is able to align uncalibrated monocular video of a real robot arm to dynamic state-action trajectories of a simulated arm without paired data. Video demonstrations of our results are available at: https://sites.google.com/view/cycledynamics .", + "title": "Learning Cross-Domain Correspondence for Control with Dynamics Cycle-Consistency", + "authors": [ + "Qiang Zhang", + "Tete Xiao", + "Alexei A Efros", + "Lerrel Pinto", + "Xiaolong Wang" + ], + "emails": [ + "~Alexei_A_Efros1", + "~Tete_Xiao1", + "~Lerrel_Pinto1", + "~Xiaolong_Wang3", + "~Qiang_Zhang5" + ], + "rank": 5 + }, + { + "url": "https://openreview.net/forum?id=HajQFbx_yB", + "decision": "Accept (Oral)", + "ratings": [ + 9, + 7, + 8 + ], + "rating": "8.08", + "confidences": [ + 5, + 4, + 4 + ], + "abstract": "Determinantal point processes (DPPs) have attracted significant attention in machine learning for their ability to model subsets drawn from a large item collection. Recent work shows that nonsymmetric DPP (NDPP) kernels have significant advantages over symmetric kernels in terms of modeling power and predictive performance. However, for an item collection of size M, existing NDPP learning and inference algorithms require memory quadratic in M and runtime cubic (for learning) or quadratic (for inference) in M, making them impractical for many typical subset selection tasks. In this work, we develop a learning algorithm with space and time requirements linear in M by introducing a new NDPP kernel decomposition. We also derive a linear-complexity NDPP maximum a posteriori (MAP) inference algorithm that applies not only to our new kernel but also to that of prior work. 
Through evaluation on real-world datasets, we show that our algorithms scale significantly better, and can match the predictive performance of prior work.", + "title": "Scalable Learning and MAP Inference for Nonsymmetric Determinantal Point Processes", + "authors": [ + "Mike Gartrell", + "Insu Han", + "Elvis Dohmatob", + "Jennifer Gillenwater", + "Victor-Emmanuel Brunel" + ], + "emails": [ + "~Victor-Emmanuel_Brunel1", + "~Jennifer_Gillenwater1", + "~Elvis_Dohmatob1", + "~Mike_Gartrell1", + "~Insu_Han1" + ], + "rank": 6 + }, + { + "url": "https://openreview.net/forum?id=PxTIG12RRHS", + "decision": "Accept (Oral)", + "ratings": [ + 8, + 9, + 7, + 8 + ], + "rating": "8.07", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Creating noise from data is easy; creating data from noise is generative modeling. We present a stochastic differential equation (SDE) that smoothly transforms a complex data distribution to a known prior distribution by slowly injecting noise, and a corresponding reverse-time SDE that transforms the prior distribution back into the data distribution by slowly removing the noise. \nCrucially, the reverse-time SDE depends only on the time-dependent gradient field (a.k.a., score) of the perturbed data distribution. By leveraging advances in score-based generative modeling, we can accurately estimate these scores with neural networks, and use numerical SDE solvers to generate samples. We show that this framework encapsulates previous approaches in score-based generative modeling and diffusion probabilistic modeling, allowing for new sampling procedures and new modeling capabilities. In particular, we introduce a predictor-corrector framework to correct errors in the evolution of the discretized reverse-time SDE. We also derive an equivalent neural ODE that samples from the same distribution as the SDE, but additionally enables exact likelihood computation, and improved sampling efficiency. 
In addition, we provide a new way to solve inverse problems with score-based models, as demonstrated with experiments on class-conditional generation, image inpainting, and colorization. Combined with multiple architectural improvements, we achieve record-breaking performance for unconditional image generation on CIFAR-10 with an Inception score of 9.89 and FID of 2.20, a competitive likelihood of 2.99 bits/dim, and demonstrate high fidelity generation of 1024\u00d71024 images for the first time from a score-based generative model.", + "title": "Score-Based Generative Modeling through Stochastic Differential Equations", + "authors": [ + "Yang Song", + "Jascha Sohl-Dickstein", + "Diederik P Kingma", + "Abhishek Kumar", + "Stefano Ermon", + "Ben Poole" + ], + "emails": [ + "~Yang_Song1", + "~Jascha_Sohl-Dickstein2", + "~Abhishek_Kumar1", + "~Stefano_Ermon1", + "~Diederik_P_Kingma1", + "~Ben_Poole1" + ], + "rank": 7 + }, + { + "url": "https://openreview.net/forum?id=rC8sJ4i6kaH", + "decision": "Accept (Oral)", + "ratings": [ + 9, + 7, + 9, + 7 + ], + "rating": "8.07", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Self-training algorithms, which train a model to fit pseudolabels predicted by another previously-learned model, have been very successful for learning with unlabeled data using neural networks. However, the current theoretical understanding of self-training only applies to linear models. This work provides a unified theoretical analysis of self-training with deep networks for semi-supervised learning, unsupervised domain adaptation, and unsupervised learning. At the core of our analysis is a simple but realistic \u201cexpansion\u201d assumption, which states that a low-probability subset of the data must expand to a neighborhood with large probability relative to the subset. We also assume that neighborhoods of examples in different classes have minimal overlap. 
We prove that under these assumptions, the minimizers of population objectives based on self-training and input-consistency regularization will achieve high accuracy with respect to ground-truth labels. By using off-the-shelf generalization bounds, we immediately convert this result to sample complexity guarantees for neural nets that are polynomial in the margin and Lipschitzness. Our results help explain the empirical successes of recently proposed self-training algorithms which use input consistency regularization.", + "title": "Theoretical Analysis of Self-Training with Deep Networks on Unlabeled Data", + "authors": [ + "Colin Wei", + "Kendrick Shen", + "Yining Chen", + "Tengyu Ma" + ], + "emails": [ + "~Tengyu_Ma1", + "stanford.edu", + "~Colin_Wei1", + "~Yining_Chen1" + ], + "rank": 8 + }, + { + "url": "https://openreview.net/forum?id=gZ9hCDWe6ke", + "decision": "Accept (Oral)", + "ratings": [ + 9, + 8, + 8, + 7 + ], + "rating": "8.06", + "confidences": [ + 5, + 5, + 4, + 4 + ], + "abstract": "DETR has been recently proposed to eliminate the need for many hand-designed components in object detection while demonstrating good performance. However, it suffers from slow convergence and limited feature spatial resolution, due to the limitation of Transformer attention modules in processing image feature maps. To mitigate these issues, we proposed Deformable DETR, whose attention modules only attend to a small set of key sampling points around a reference. Deformable DETR can achieve better performance than DETR (especially on small objects) with 10\u00d7 less training epochs. Extensive experiments on the COCO benchmark demonstrate the effectiveness of our approach. 
Code is released at https://github.com/fundamentalvision/Deformable-DETR.", + "title": "Deformable DETR: Deformable Transformers for End-to-End Object Detection", + "authors": [ + "Xizhou Zhu", + "Weijie Su", + "Lewei Lu", + "Bin Li", + "Xiaogang Wang", + "Jifeng Dai" + ], + "emails": [ + "~Jifeng_Dai1", + "sensetime.com", + "~Xiaogang_Wang2", + "ustc.edu.cn", + "~Xizhou_Zhu1", + "~Weijie_Su2" + ], + "rank": 9 + }, + { + "url": "https://openreview.net/forum?id=RGJbergVIoO", + "decision": "Accept (Oral)", + "ratings": [ + 10, + 7, + 7 + ], + "rating": "8.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Hopfield networks (HNs) and Restricted Boltzmann Machines (RBMs) are two important models at the interface of statistical physics, machine learning, and neuroscience. Recently, there has been interest in the relationship between HNs and RBMs, due to their similarity under the statistical mechanics formalism. An exact mapping between HNs and RBMs has been previously noted for the special case of orthogonal (\u201cuncorrelated\u201d) encoded patterns. We present here an exact mapping in the case of correlated pattern HNs, which are more broadly applicable to existing datasets. Specifically, we show that any HN with N binary variables and p<N potentially correlated binary patterns can be transformed into an RBM with N binary visible variables and p gaussian hidden variables. We outline the conditions under which the reverse mapping exists, and conduct experiments on the MNIST dataset which suggest the mapping provides a useful initialization to the RBM weights. 
We discuss extensions, the potential importance of this correspondence for the training of RBMs, and for understanding the performance of feature extraction methods which utilize RBMs.", + "title": "On the mapping between Hopfield networks and Restricted Boltzmann Machines", + "authors": [ + "Matthew Smart", + "Anton Zilman" + ], + "emails": [ + "physics.utoronto.ca", + "~Matthew_Smart1" + ], + "rank": 10 + }, + { + "url": "https://openreview.net/forum?id=rcQdycl0zyk", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 8, + 8 + ], + "rating": "8.00", + "confidences": [ + 5, + 4, + 3 + ], + "abstract": "Recent works have demonstrated reasonable success of representation learning in hypercomplex space. Specifically, \u201cfully-connected layers with quaternions\u201d (quaternions are 4D hypercomplex numbers), which replace real-valued matrix multiplications in fully-connected layers with Hamilton products of quaternions, both enjoy parameter savings with only 1/4 learnable parameters and achieve comparable performance in various applications. However, one key caveat is that hypercomplex space only exists at very few predefined dimensions (4D, 8D, and 16D). This restricts the flexibility of models that leverage hypercomplex multiplications. To this end, we propose parameterizing hypercomplex multiplications, allowing models to learn multiplication rules from data regardless of whether such rules are predefined. As a result, our method not only subsumes the Hamilton product, but also learns to operate on any arbitrary nD hypercomplex space, providing more architectural flexibility using arbitrarily 1/n learnable parameters compared with the fully-connected layer counterpart. 
Experiments of applications to the LSTM and transformer models on natural language inference, machine translation, text style transfer, and subject verb agreement demonstrate architectural flexibility and effectiveness of the proposed approach.", + "title": "Beyond Fully-Connected Layers with Quaternions: Parameterization of Hypercomplex Multiplications with 1/n Parameters", + "authors": [ + "Aston Zhang", + "Yi Tay", + "SHUAI Zhang", + "Alvin Chan", + "Anh Tuan Luu", + "Siu Hui", + "Jie Fu" + ], + "emails": [ + "~Aston_Zhang2", + "~Yi_Tay1", + "~SHUAI_Zhang5", + "~Jie_Fu2", + "~Alvin_Chan1", + "~Anh_Tuan_Luu2", + "~Siu_Hui1" + ], + "rank": 11 + }, + { + "url": "https://openreview.net/forum?id=04LZCAxMSco", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 9, + 8 + ], + "rating": "8.00", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "We consider the problem of learning a latent k-vertex simplex K\u2208Rd, given A\u2208Rd\u00d7n, which can be viewed as n data points that are formed by randomly perturbing some latent points in K, possibly beyond K. A large class of latent variable models, such as adversarial clustering, mixed membership stochastic block models, and topic models can be cast in this view of learning a latent simplex. Bhattacharyya and Kannan (SODA 2020) give an algorithm for learning such a k-vertex latent simplex in time roughly O(k\u22c5nnz(A)), where nnz(A) is the number of non-zeros in A. We show that the dependence on k in the running time is unnecessary given a natural assumption about the mass of the top k singular values of A, which holds in many of these applications. Further, we show this assumption is necessary, as otherwise an algorithm for learning a latent simplex would imply a better low rank approximation algorithm than what is known. 
\n\nWe obtain a spectral low-rank approximation to A in input-sparsity time and show that the column space thus obtained has small sin\u2061\u0398 (angular) distance to the right top-k singular space of A. Our algorithm then selects k points in the low-rank subspace with the largest inner product (in absolute value) with k carefully chosen random vectors. By working in the low-rank subspace, we avoid reading the entire matrix in each iteration and thus circumvent the \u0398(k\u22c5nnz(A)) running time.", + "title": "Learning a Latent Simplex in Input Sparsity Time", + "authors": [ + "Ainesh Bakshi", + "Chiranjib Bhattacharyya", + "Ravi Kannan", + "David Woodruff", + "Samson Zhou" + ], + "emails": [ + "~David_Woodruff1", + "~Chiranjib_Bhattacharyya1", + "~Ainesh_Bakshi1", + "~Samson_Zhou1", + "~Ravi_Kannan1" + ], + "rank": 12 + }, + { + "url": "https://openreview.net/forum?id=kmG8vRXTFv", + "decision": "Accept (Oral)", + "ratings": [ + 9, + 7, + 8 + ], + "rating": "8.00", + "confidences": [ + 3, + 3, + 4 + ], + "abstract": "Forecasting complex dynamical phenomena in settings where only partial knowledge of their dynamics is available is a prevalent problem across various scientific fields. While purely data-driven approaches are arguably insufficient in this context, standard physical modeling based approaches tend to be over-simplistic, inducing non-negligible errors. In this work, we introduce the APHYNITY framework, a principled approach for augmenting incomplete physical dynamics described by differential equations with deep data-driven models. It consists in decomposing the dynamics into two components: a physical component accounting for the dynamics for which we have some prior knowledge, and a data-driven component accounting for errors of the physical model. 
The learning problem is carefully formulated such that the physical model explains as much of the data as possible, while the data-driven component only describes information that cannot be captured by the physical model, no more, no less. This not only provides the existence and uniqueness for this decomposition, but also ensures interpretability and benefits generalization. Experiments made on three important use cases, each representative of a different family of phenomena, i.e. reaction-diffusion equations, wave equations and the non-linear damped pendulum, show that APHYNITY can efficiently leverage approximate physical models to accurately forecast the evolution of the system and correctly identify relevant physical parameters.", + "title": "Augmenting Physical Models with Deep Networks for Complex Dynamics Forecasting", + "authors": [ + "Yuan Yin", + "Vincent LE GUEN", + "J\u00e9r\u00e9mie DONA", + "Emmanuel de Bezenac", + "Ibrahim Ayed", + "Nicolas THOME", + "patrick gallinari" + ], + "emails": [ + "~patrick_gallinari1", + "~Ibrahim_Ayed1", + "~J\u00e9r\u00e9mie_DONA2", + "~Vincent_LE_GUEN1", + "~Nicolas_THOME2", + "~Yuan_Yin1", + "~Emmanuel_de_Bezenac1" + ], + "rank": 13 + }, + { + "url": "https://openreview.net/forum?id=m5Qsh0kBQG", + "decision": "Accept (Oral)", + "ratings": [ + 8, + 7, + 8, + 9 + ], + "rating": "7.93", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Discovering the underlying mathematical expressions describing a dataset is a core challenge for artificial intelligence. This is the problem of symbolic regression. Despite recent advances in training neural networks to solve complex tasks, deep learning approaches to symbolic regression are underexplored. We propose a framework that leverages deep learning for symbolic regression via a simple idea: use a large model to search the space of small models. 
Specifically, we use a recurrent neural network to emit a distribution over tractable mathematical expressions and employ a novel risk-seeking policy gradient to train the network to generate better-fitting expressions. Our algorithm outperforms several baseline methods (including Eureqa, the gold standard for symbolic regression) in its ability to exactly recover symbolic expressions on a series of benchmark problems, both with and without added noise. More broadly, our contributions include a framework that can be applied to optimize hierarchical, variable-length objects under a black-box performance metric, with the ability to incorporate constraints in situ, and a risk-seeking policy gradient formulation that optimizes for best-case performance instead of expected performance.", + "title": "Deep symbolic regression: Recovering mathematical expressions from data via risk-seeking policy gradients", + "authors": [ + "Brenden K Petersen", + "Mikel Landajuela Larma", + "Terrell N. Mundhenk", + "Claudio Prata Santiago", + "Soo Kyung Kim", + "Joanne Taery Kim" + ], + "emails": [ + "llnl.gov", + "~Terrell_N._Mundhenk1", + "~Brenden_K_Petersen1", + "~Soo_Kyung_Kim1" + ], + "rank": 14 + }, + { + "url": "https://openreview.net/forum?id=nIAxjsniDzg", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 9, + 9, + 7 + ], + "rating": "7.86", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "In recent years, reinforcement learning (RL) has been successfully applied to many different continuous control tasks. While RL algorithms are often conceptually simple, their state-of-the-art implementations take numerous low- and high-level design decisions that strongly affect the performance of the resulting agents. Those choices are usually not extensively discussed in the literature, leading to discrepancy between published descriptions of algorithms and their implementations. This makes it hard to attribute progress in RL and slows down overall progress [Engstrom'20]. 
As a step towards filling that gap, we implement >50 such ``\"choices\" in a unified on-policy deep actor-critic framework, allowing us to investigate their impact in a large-scale empirical study. We train over 250'000 agents in five continuous control environments of different complexity and provide insights and practical recommendations for the training of on-policy deep actor-critic RL agents.", + "title": "What Matters for On-Policy Deep Actor-Critic Methods? A Large-Scale Study", + "authors": [ + "Marcin Andrychowicz", + "Anton Raichuk", + "Piotr Sta\u0144czyk", + "Manu Orsini", + "Sertan Girgin", + "Rapha\u00ebl Marinier", + "Leonard Hussenot", + "Matthieu Geist", + "Olivier Pietquin", + "Marcin Michalski", + "Sylvain Gelly", + "Olivier Bachem" + ], + "emails": [ + "~Matthieu_Geist1", + "~Leonard_Hussenot1", + "google.com", + "~Rapha\u00ebl_Marinier1", + "~Marcin_Michalski1", + "~Sylvain_Gelly1", + "~Olivier_Bachem1", + "~Anton_Raichuk1", + "~Marcin_Andrychowicz1", + "~Olivier_Pietquin1" + ], + "rank": 15 + }, + { + "url": "https://openreview.net/forum?id=VJnrYcnRc6", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 10, + 7 + ], + "rating": "7.86", + "confidences": [ + 3, + 3, + 5, + 3 + ], + "abstract": "Although deep learning has achieved appealing results on several machine learning tasks, most of the models are deterministic at inference, limiting their application to single-modal settings. We propose a novel general-purpose framework for conditional generation in multimodal spaces, that uses latent variables to model generalizable learning patterns while minimizing a family of regression cost functions. At inference, the latent variables are optimized to find solutions corresponding to multiple output modes. Compared to existing generative solutions, our approach demonstrates faster and more stable convergence, and can learn better representations for downstream tasks. 
Importantly, it provides a simple generic model that can perform better than highly engineered pipelines tailored using domain expertise on a variety of tasks, while generating diverse outputs. Code available at https://github.com/samgregoost/cGML.", + "title": "Conditional Generative Modeling via Learning the Latent Space", + "authors": [ + "Sameera Ramasinghe", + "Kanchana Nisal Ranasinghe", + "Salman Khan", + "Nick Barnes", + "Stephen Gould" + ], + "emails": [ + "~Sameera_Ramasinghe1", + "~Stephen_Gould1", + "~Salman_Khan4", + "~Kanchana_Nisal_Ranasinghe1", + "~Nick_Barnes3" + ], + "rank": 16 + }, + { + "url": "https://openreview.net/forum?id=-2FCwDKRREu", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 7, + 9 + ], + "rating": "7.83", + "confidences": [ + 4, + 3, + 5 + ], + "abstract": "We study how representation learning can accelerate reinforcement learning from rich observations, such as images, without relying either on domain knowledge or pixel-reconstruction. Our goal is to learn representations that provide for effective downstream control and invariance to task-irrelevant details. Bisimulation metrics quantify behavioral similarity between states in continuous MDPs, which we propose using to learn robust latent representations which encode only the task-relevant information from observations. Our method trains encoders such that distances in latent space equal bisimulation distances in state space. We demonstrate the effectiveness of our method at disregarding task-irrelevant information using modified visual MuJoCo tasks, where the background is replaced with moving distractors and natural videos, while achieving SOTA performance. We also test a first-person highway driving task where our method learns invariance to clouds, weather, and time of day. 
Finally, we provide generalization results drawn from properties of bisimulation metrics, and links to causal inference.", + "title": "Learning Invariant Representations for Reinforcement Learning without Reconstruction", + "authors": [ + "Amy Zhang", + "Rowan Thomas McAllister", + "Roberto Calandra", + "Yarin Gal", + "Sergey Levine" + ], + "emails": [ + "~Rowan_Thomas_McAllister1", + "~Amy_Zhang1", + "~Yarin_Gal1", + "~Sergey_Levine1", + "~Roberto_Calandra1" + ], + "rank": 17 + }, + { + "url": "https://openreview.net/forum?id=PKubaeJkw3", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 10, + 7, + 7 + ], + "rating": "7.83", + "confidences": [ + 5, + 5, + 4, + 4 + ], + "abstract": "Differentiable Neural Architecture Search is one of the most popular Neural Architecture Search (NAS) methods for its search efficiency and simplicity, accomplished by jointly optimizing the model weight and architecture parameters in a weight-sharing supernet via gradient-based algorithms. At the end of the search phase, the operations with the largest architecture parameters will be selected to form the final architecture, with the implicit assumption that the values of architecture parameters reflect the operation strength. While much has been discussed about the supernet's optimization, the architecture selection process has received little attention. We provide empirical and theoretical analysis to show that the magnitude of architecture parameters does not necessarily indicate how much the operation contributes to the supernet's performance. We propose an alternative perturbation-based architecture selection that directly measures each operation's influence on the supernet. We re-evaluate several differentiable NAS methods with the proposed architecture selection and find that it is able to extract significantly improved architectures from the underlying supernets consistently. 
Furthermore, we find that several failure modes of DARTS can be greatly alleviated with the proposed selection method, indicating that much of the poor generalization observed in DARTS can be attributed to the failure of magnitude-based architecture selection rather than entirely the optimization of its supernet.", + "title": "Rethinking Architecture Selection in Differentiable NAS", + "authors": [ + "Ruochen Wang", + "Minhao Cheng", + "Xiangning Chen", + "Xiaocheng Tang", + "Cho-Jui Hsieh" + ], + "emails": [ + "~Minhao_Cheng1", + "~Cho-Jui_Hsieh1", + "~Xiaocheng_Tang1", + "~Xiangning_Chen1", + "~Ruochen_Wang2" + ], + "rank": 18 + }, + { + "url": "https://openreview.net/forum?id=FGqiDsBUKL0", + "decision": "Accept (Oral)", + "ratings": [ + 8, + 7, + 8 + ], + "rating": "7.75", + "confidences": [ + 5, + 3, + 4 + ], + "abstract": "Natural images are projections of 3D objects on a 2D image plane. While state-of-the-art 2D generative models like GANs show unprecedented quality in modeling the natural image manifold, it is unclear whether they implicitly capture the underlying 3D object structures. And if so, how could we exploit such knowledge to recover the 3D shapes of objects in the images? To answer these questions, in this work, we present the first attempt to directly mine 3D geometric cues from an off-the-shelf 2D GAN that is trained on RGB images only. Through our investigation, we found that such a pre-trained GAN indeed contains rich 3D knowledge and thus can be used to recover 3D shape from a single 2D image in an unsupervised manner. The core of our framework is an iterative strategy that explores and exploits diverse viewpoint and lighting variations in the GAN image manifold. The framework does not require 2D keypoint or 3D annotations, or strong assumptions on object shapes (e.g. shapes are symmetric), yet it successfully recovers 3D shapes with high precision for human faces, cats, cars, and buildings. 
The recovered 3D shapes immediately allow high-quality image editing like relighting and object rotation. We quantitatively demonstrate the effectiveness of our approach compared to previous methods in both 3D shape reconstruction and face rotation. Our code is available at https://github.com/XingangPan/GAN2Shape.", + "title": "Do 2D GANs Know 3D Shape? Unsupervised 3D Shape Reconstruction from 2D Image GANs", + "authors": [ + "Xingang Pan", + "Bo Dai", + "Ziwei Liu", + "Chen Change Loy", + "Ping Luo" + ], + "emails": [ + "~Ziwei_Liu1", + "~Chen_Change_Loy2", + "~Xingang_Pan1", + "~Ping_Luo2", + "~Bo_Dai2" + ], + "rank": 19 + }, + { + "url": "https://openreview.net/forum?id=roNqYL0_XP", + "decision": "Accept (Spotlight)", + "ratings": [ + 9, + 6, + 6, + 10 + ], + "rating": "7.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Mesh-based simulations are central to modeling complex physical systems in many disciplines across science and engineering. Mesh representations support powerful numerical integration methods and their resolution can be adapted to strike favorable trade-offs between accuracy and efficiency. However, high-dimensional scientific simulations are very expensive to run, and solvers and parameters must often be tuned individually to each system studied.\nHere we introduce MeshGraphNets, a framework for learning mesh-based simulations using graph neural networks. Our model can be trained to pass messages on a mesh graph and to adapt the mesh discretization during forward simulation. Our results show it can accurately predict the dynamics of a wide range of physical systems, including aerodynamics, structural mechanics, and cloth. The model's adaptivity supports learning resolution-independent dynamics and can scale to more complex state spaces at test time. Our method is also highly efficient, running 1-2 orders of magnitude faster than the simulation on which it is trained. 
Our approach broadens the range of problems on which neural network simulators can operate and promises to improve the efficiency of complex, scientific modeling tasks.", + "title": "Learning Mesh-Based Simulation with Graph Networks", + "authors": [ + "Tobias Pfaff", + "Meire Fortunato", + "Alvaro Sanchez-Gonzalez", + "Peter Battaglia" + ], + "emails": [ + "~Peter_Battaglia1", + "~Meire_Fortunato1", + "~Alvaro_Sanchez-Gonzalez1", + "~Tobias_Pfaff1" + ], + "rank": 20 + }, + { + "url": "https://openreview.net/forum?id=Mos9F9kDwkz", + "decision": "Accept (Oral)", + "ratings": [ + 9, + 6, + 8, + 9 + ], + "rating": "7.73", + "confidences": [ + 2, + 5, + 4, + 4 + ], + "abstract": "Neural link predictors are immensely useful for identifying missing edges in large scale Knowledge Graphs. However, it is still not clear how to use these models for answering more complex queries that arise in a number of domains, such as queries using logical conjunctions (\u2227), disjunctions (\u2228) and existential quantifiers (\u2203), while accounting for missing edges. In this work, we propose a framework for efficiently answering complex queries on incomplete Knowledge Graphs. We translate each query into an end-to-end differentiable objective, where the truth value of each atom is computed by a pre-trained neural link predictor. We then analyse two solutions to the optimisation problem, including gradient-based and combinatorial search. In our experiments, the proposed approach produces more accurate results than state-of-the-art methods --- black-box neural models trained on millions of generated queries --- without the need of training on a large and diverse set of complex queries. Using orders of magnitude less training data, we obtain relative improvements ranging from 8% up to 40% in Hits@3 across different knowledge graphs containing factual information. 
Finally, we demonstrate that it is possible to explain the outcome of our model in terms of the intermediate solutions identified for each of the complex query atoms. All our source code and datasets are available online, at https://github.com/uclnlp/cqd.", + "title": "Complex Query Answering with Neural Link Predictors", + "authors": [ + "Erik Arakelyan", + "Daniel Daza", + "Pasquale Minervini", + "Michael Cochez" + ], + "emails": [ + "~Michael_Cochez2", + "gmail.com", + "alumni.ucl.ac.uk", + "~Pasquale_Minervini1" + ], + "rank": 21 + }, + { + "url": "https://openreview.net/forum?id=lxHgXYN4bwl", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 8, + 6, + 9 + ], + "rating": "7.73", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Various classes of Graph Neural Networks (GNN) have been proposed and shown to be successful in a wide range of applications with graph structured data. In this paper, we propose a theoretical framework able to compare the expressive power of these GNN architectures. The current universality theorems only apply to intractable classes of GNNs. Here, we prove the first approximation guarantees for practical GNNs, paving the way for a better understanding of their generalization. Our theoretical results are proved for invariant GNNs computing a graph embedding (permutation of the nodes of the input graph does not affect the output) and equivariant GNNs computing an embedding of the nodes (permutation of the input permutes the output). We show that Folklore Graph Neural Networks (FGNN), which are tensor based GNNs augmented with matrix multiplication are the most expressive architectures proposed so far for a given tensor order. We illustrate our results on the Quadratic Assignment Problem (a NP-Hard combinatorial problem) by showing that FGNNs are able to learn how to solve the problem, leading to much better average performances than existing algorithms (based on spectral, SDP or other GNNs architectures). 
On a practical side, we also implement masked tensors to handle batches of graphs of varying sizes. ", + "title": "Expressive Power of Invariant and Equivariant Graph Neural Networks", + "authors": [ + "Waiss Azizian", + "marc lelarge" + ], + "emails": [ + "~marc_lelarge1", + "ens.fr" + ], + "rank": 22 + }, + { + "url": "https://openreview.net/forum?id=iAX0l6Cz8ub", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 8, + 8 + ], + "rating": "7.71", + "confidences": [ + 4, + 5, + 5 + ], + "abstract": "In adversarial machine learning, there was a common belief that robustness and accuracy hurt each other. The belief was challenged by recent studies where we can maintain the robustness and improve the accuracy. However, the other direction, whether we can keep the accuracy and improve the robustness, is conceptually and practically more interesting, since robust accuracy should be lower than standard accuracy for any model. In this paper, we show this direction is also promising. Firstly, we find even over-parameterized deep networks may still have insufficient model capacity, because adversarial training has an overwhelming smoothing effect. Secondly, given limited model capacity, we argue adversarial data should have unequal importance: geometrically speaking, a natural data point closer to/farther from the class boundary is less/more robust, and the corresponding adversarial data point should be assigned with larger/smaller weight. Finally, to implement the idea, we propose geometry-aware instance-reweighted adversarial training, where the weights are based on how difficult it is to attack a natural data point. 
Experiments show that our proposal boosts the robustness of standard adversarial training; combining two directions, we improve both robustness and accuracy of standard adversarial training.", + "title": "Geometry-aware Instance-reweighted Adversarial Training", + "authors": [ + "Jingfeng Zhang", + "Jianing Zhu", + "Gang Niu", + "Bo Han", + "Masashi Sugiyama", + "Mohan Kankanhalli" + ], + "emails": [ + "~Masashi_Sugiyama1", + "~Jianing_Zhu2", + "~Jingfeng_Zhang1", + "~Mohan_Kankanhalli1", + "~Bo_Han1", + "~Gang_Niu1" + ], + "rank": 23 + }, + { + "url": "https://openreview.net/forum?id=uAX8q61EVRu", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 9, + 7 + ], + "rating": "7.71", + "confidences": [ + 5, + 5, + 4 + ], + "abstract": "We present a neural rendering approach for binaural sound synthesis that can produce realistic and spatially accurate binaural sound in realtime. The network takes, as input, a single-channel audio source and synthesizes, as output, two-channel binaural sound, conditioned on the relative position and orientation of the listener with respect to the source. We investigate deficiencies of the l2-loss on raw waveforms in a theoretical analysis and introduce an improved loss that overcomes these limitations. In an empirical evaluation, we establish that our approach is the first to generate spatially accurate waveform outputs (as measured by real recordings) and outperforms existing approaches by a considerable margin, both quantitatively and in a perceptual study. Dataset and code are available online.", + "title": "Neural Synthesis of Binaural Speech From Mono Audio", + "authors": [ + "Alexander Richard", + "Dejan Markovic", + "Israel D. 
Gebru", + "Steven Krenn", + "Gladstone Alexander Butler", + "Fernando Torre", + "Yaser Sheikh" + ], + "emails": [ + "~Dejan_Markovic1", + "~Yaser_Sheikh1", + "~Gladstone_Alexander_Butler1", + "~Alexander_Richard1", + "~Israel_D._Gebru1", + "~Steven_Krenn1", + "~Fernando_Torre1" + ], + "rank": 24 + }, + { + "url": "https://openreview.net/forum?id=5k8F6UU39V", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 8, + 8, + 8 + ], + "rating": "7.71", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Entities are at the center of how we represent and aggregate knowledge. For instance, Encyclopedias such as Wikipedia are structured by entities (e.g., one per Wikipedia article). The ability to retrieve such entities given a query is fundamental for knowledge-intensive tasks such as entity linking and open-domain question answering. One way to understand current approaches is as classifiers among atomic labels, one for each entity. Their weight vectors are dense entity representations produced by encoding entity meta information such as their descriptions. This approach leads to several shortcomings: (i) context and entity affinity is mainly captured through a vector dot product, potentially missing fine-grained interactions between the two; (ii) a large memory footprint is needed to store dense representations when considering large entity sets; (iii) an appropriately hard set of negative data has to be subsampled at training time. In this work, we propose GENRE, the first system that retrieves entities by generating their unique names, left to right, token-by-token in an autoregressive fashion and conditioned on the context. 
This enables us to mitigate the aforementioned technical issues since: (i) the autoregressive formulation allows us to directly capture relations between context and entity name, effectively cross encoding both; (ii) the memory footprint is greatly reduced because the parameters of our encoder-decoder architecture scale with vocabulary size, not entity count; (iii) the exact softmax loss can be efficiently computed without the need to subsample negative data. We show the efficacy of the approach, experimenting with more than 20 datasets on entity disambiguation, end-to-end entity linking and document retrieval tasks, achieving new state-of-the-art or very competitive results while using a tiny fraction of the memory footprint of competing systems. Finally, we demonstrate that new entities can be added by simply specifying their unambiguous name. Code and pre-trained models at https://github.com/facebookresearch/GENRE.", + "title": "Autoregressive Entity Retrieval", + "authors": [ + "Nicola De Cao", + "Gautier Izacard", + "Sebastian Riedel", + "Fabio Petroni" + ], + "emails": [ + "~Fabio_Petroni2", + "~Gautier_Izacard1", + "~Nicola_De_Cao1", + "~Sebastian_Riedel1" + ], + "rank": 25 + }, + { + "url": "https://openreview.net/forum?id=QYjO70ACDK", + "decision": "Accept (Spotlight)", + "ratings": [ + 9, + 7, + 7 + ], + "rating": "7.71", + "confidences": [ + 5, + 4, + 5 + ], + "abstract": "Sliced-Wasserstein distance (SW) and its variant, Max Sliced-Wasserstein distance (Max-SW), have been used widely in the recent years due to their fast computation and scalability even when the probability measures lie in a very high dimensional space. However, SW requires many unnecessary projection samples to approximate its value while Max-SW only uses the most important projection, which ignores the information of other useful directions. 
In order to account for these weaknesses, we propose a novel distance, named Distributional Sliced-Wasserstein distance (DSW), that finds an optimal distribution over projections that can balance between exploring distinctive projecting directions and the informativeness of projections themselves. We show that the DSW is a generalization of Max-SW, and it can be computed efficiently by searching for the optimal push-forward measure over a set of probability measures over the unit sphere satisfying certain regularizing constraints that favor distinct directions. Finally, we conduct extensive experiments with large-scale datasets to demonstrate the favorable performances of the proposed distances over the previous sliced-based distances in generative modeling applications.", + "title": "Distributional Sliced-Wasserstein and Applications to Generative Modeling", + "authors": [ + "Khai Nguyen", + "Nhat Ho", + "Tung Pham", + "Hung Bui" + ], + "emails": [ + "vinai.io", + "~Khai_Nguyen1", + "~Nhat_Ho1", + "~Hung_Bui1" + ], + "rank": 26 + }, + { + "url": "https://openreview.net/forum?id=tW4QEInpni", + "decision": "Accept (Oral)", + "ratings": [ + 8, + 8, + 7 + ], + "rating": "7.70", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "Inspired by human learning, researchers have proposed ordering examples during training based on their difficulty. Both curriculum learning, exposing a network to easier examples early in training, and anti-curriculum learning, showing the most difficult examples first, have been suggested as improvements to the standard i.i.d. training. In this work, we set out to investigate the relative benefits of ordered learning. We first investigate the implicit curricula resulting from architectural and optimization bias and find that samples are learned in a highly consistent order. 
Next, to quantify the benefit of explicit curricula, we conduct extensive experiments over thousands of orderings spanning three kinds of learning: curriculum, anti-curriculum, and random-curriculum -- in which the size of the training dataset is dynamically increased over time, but the examples are randomly ordered. We find that for standard benchmark datasets, curricula have only marginal benefits, and that randomly ordered samples perform as well or better than curricula and anti-curricula, suggesting that any benefit is entirely due to the dynamic training set size. Inspired by common use cases of curriculum learning in practice, we investigate the role of limited training time budget and noisy data in the success of curriculum learning. Our experiments demonstrate that curriculum, but not anti-curriculum or random ordering can indeed improve the performance either with limited training time budget or in the existence of noisy data.", + "title": "When Do Curricula Work?", + "authors": [ + "Xiaoxia Wu", + "Ethan Dyer", + "Behnam Neyshabur" + ], + "emails": [ + "~Ethan_Dyer1", + "~Behnam_Neyshabur1", + "~Xiaoxia_Wu1" + ], + "rank": 27 + }, + { + "url": "https://openreview.net/forum?id=Wj4ODo0uyCF", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 9, + 7, + 8 + ], + "rating": "7.69", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "Using a mix of shared and language-specific (LS) parameters has shown promise in multilingual neural machine translation (MNMT), but the question of when and where LS capacity matters most is still under-studied. We offer such a study by proposing conditional language-specific routing (CLSR). CLSR employs hard binary gates conditioned on token representations to dynamically select LS or shared paths. By manipulating these gates, it can schedule LS capacity across sub-layers in MNMT subject to the guidance of translation signals and budget constraints. Moreover, CLSR can easily scale up to massively multilingual settings. 
Experiments with Transformer on OPUS-100 and WMT datasets show that: 1) MNMT is sensitive to both the amount and the position of LS modeling: distributing 10%-30% LS computation to the top and/or bottom encoder/decoder layers delivers the best performance; and 2) one-to-many translation benefits more from CLSR compared to many-to-one translation, particularly with unbalanced training data. Our study further verifies the trade-off between the shared capacity and LS capacity for multilingual translation. We corroborate our analysis by confirming the soundness of our findings as foundation of our improved multilingual Transformers. Source code and models are available at https://github.com/googleinterns/cct-m4.", + "title": "Share or Not? Learning to Schedule Language-Specific Capacity for Multilingual Translation", + "authors": [ + "Biao Zhang", + "Ankur Bapna", + "Rico Sennrich", + "Orhan Firat" + ], + "emails": [ + "~Rico_Sennrich1", + "~Ankur_Bapna1", + "~Biao_Zhang2", + "~Orhan_Firat1" + ], + "rank": 28 + }, + { + "url": "https://openreview.net/forum?id=PULSD5qI2N1", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 8, + 8, + 8, + 7 + ], + "rating": "7.69", + "confidences": [ + 2, + 5, + 4, + 2, + 3 + ], + "abstract": "We analyze the convergence of the averaged stochastic gradient descent for overparameterized two-layer neural networks for regression problems. It was recently found that a neural tangent kernel (NTK) plays an important role in showing the global convergence of gradient-based methods under the NTK regime, where the learning dynamics for overparameterized neural networks can be almost characterized by that for the associated reproducing kernel Hilbert space (RKHS). However, there is still room for a convergence rate analysis in the NTK regime. 
In this study, we show that the averaged stochastic gradient descent can achieve the minimax optimal convergence rate, with the global convergence guarantee, by exploiting the complexities of the target function and the RKHS associated with the NTK. Moreover, we show that the target function specified by the NTK of a ReLU network can be learned at the optimal convergence rate through a smooth approximation of a ReLU network under certain conditions.", + "title": "Optimal Rates for Averaged Stochastic Gradient Descent under Neural Tangent Kernel Regime", + "authors": [ + "Atsushi Nitanda", + "Taiji Suzuki" + ], + "emails": [ + "~Taiji_Suzuki1", + "~Atsushi_Nitanda1" + ], + "rank": 29 + }, + { + "url": "https://openreview.net/forum?id=Z4R1vxLbRLO", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 9 + ], + "rating": "7.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "We construct an experimental setup in which changing the scale of initialization strongly impacts the implicit regularization induced by SGD, interpolating from good generalization performance to completely memorizing the training set while making little progress on the test set. Moreover, we find that the extent and manner in which generalization ability is affected depends on the activation and loss function used, with sin activation being the most extreme. In the case of the homogeneous ReLU activation, we show that this behavior can be attributed to the loss function. Our empirical investigation reveals that increasing the scale of initialization correlates with misalignment of representations and gradients across examples in the same class. This insight allows us to device an alignment measure over gradients and representations which can capture this phenomenon. 
We demonstrate that our alignment measure correlates with generalization of deep models trained on image classification tasks.", + "title": "Extreme Memorization via Scale of Initialization", + "authors": [ + "Harsh Mehta", + "Ashok Cutkosky", + "Behnam Neyshabur" + ], + "emails": [ + "~Behnam_Neyshabur1", + "~Ashok_Cutkosky1", + "~Harsh_Mehta1" + ], + "rank": 30 + }, + { + "url": "https://openreview.net/forum?id=lVgB2FUbzuQ", + "decision": "Accept (Spotlight)", + "ratings": [ + 9, + 7, + 7 + ], + "rating": "7.67", + "confidences": [ + 3, + 2, + 4 + ], + "abstract": "The COVID-19 pandemic has spread rapidly worldwide, overwhelming manual contact tracing in many countries and resulting in widespread lockdowns for emergency containment. Large-scale digital contact tracing (DCT) has emerged as a potential solution to resume economic and social activity while minimizing spread of the virus. Various DCT methods have been proposed, each making trade-offs be-tween privacy, mobility restrictions, and public health. The most common approach, binary contact tracing (BCT), models infection as a binary event, informed only by an individual\u2019s test results, with corresponding binary recommendations that either all or none of the individual\u2019s contacts quarantine. BCT ignores the inherent uncertainty in contacts and the infection process, which could be used to tailor messaging to high-risk individuals, and prompt proactive testing or earlier warnings. It also does not make use of observations such as symptoms or pre-existing medical conditions, which could be used to make more accurate infectiousness predictions. In this paper, we use a recently-proposed COVID-19 epidemiological simulator to develop and test methods that can be deployed to a smartphone to locally and proactively predict an individual\u2019s infectiousness (risk of infecting others) based on their contact history and other information, while respecting strong privacy constraints. 
Predictions are used to provide personalized recommendations to the individual via an app, as well as to send anonymized messages to the individual\u2019s contacts, who use this information to better predict their own infectiousness, an approach we call proactive contact tracing (PCT). Similarly to other works, we find that compared to no tracing, all DCT methods tested are able to reduce spread of the disease and thus save lives, even at low adoption rates, strongly supporting a role for DCT methods in managing the pandemic. Further, we find a deep-learning based PCT method which improves over BCT for equivalent average mobility, suggesting PCT could help in safe re-opening and second-wave prevention.", + "title": "Predicting Infectiousness for Proactive Contact Tracing", + "authors": [ + "Yoshua Bengio", + "Prateek Gupta", + "Tegan Maharaj", + "Nasim Rahaman", + "Martin Weiss", + "Tristan Deleu", + "Eilif Benjamin Muller", + "Meng Qu", + "victor schmidt", + "Pierre-luc St-charles", + "hannah alsdurf", + "Olexa Bilaniuk", + "david buckeridge", + "gaetan caron", + "pierre luc carrier", + "Joumana Ghosn", + "satya ortiz gagne", + "Christopher Pal", + "Irina Rish", + "Bernhard Sch\u00f6lkopf", + "abhinav sharma", + "Jian Tang", + "andrew williams" + ], + "emails": [ + "~Olexa_Bilaniuk1", + "mila.quebec", + "uottawa.ca", + "~Tristan_Deleu1", + "~Nasim_Rahaman1", + "~Eilif_Benjamin_Muller1", + "mcgill.ca", + "~Bernhard_Sch\u00f6lkopf1", + "~Pierre-luc_St-charles1", + "~Meng_Qu2", + "~Yoshua_Bengio1", + "~Christopher_Pal1", + "~Joumana_Ghosn1", + "~Tegan_Maharaj1", + "~Prateek_Gupta2", + "umontreal.ca", + "~Jian_Tang1", + "~Martin_Weiss4", + "~Irina_Rish1" + ], + "rank": 31 + }, + { + "url": "https://openreview.net/forum?id=NzTU59SYbNq", + "decision": "Accept (Oral)", + "ratings": [ + 8, + 8, + 7 + ], + "rating": "7.67", + "confidences": [ + 3, + 3, + 3 + ], + "abstract": "We present a novel view on principal components analysis as a competitive game in which each 
approximate eigenvector is controlled by a player whose goal is to maximize their own utility function. We analyze the properties of this PCA game and the behavior of its gradient based updates. The resulting algorithm---which combines elements from Oja's rule with a generalized Gram-Schmidt orthogonalization---is naturally decentralized and hence parallelizable through message passing. We demonstrate the scalability of the algorithm with experiments on large image datasets and neural network activations. We discuss how this new view of PCA as a differentiable game can lead to further algorithmic developments and insights.", + "title": "EigenGame: PCA as a Nash Equilibrium", + "authors": [ + "Ian Gemp", + "Brian McWilliams", + "Claire Vernade", + "Thore Graepel" + ], + "emails": [ + "~Thore_Graepel1", + "~Claire_Vernade1", + "~Brian_McWilliams2", + "~Ian_Gemp1" + ], + "rank": 32 + }, + { + "url": "https://openreview.net/forum?id=GJwMHetHc73", + "decision": "Accept (Spotlight)", + "ratings": [ + 6, + 7, + 9 + ], + "rating": "7.62", + "confidences": [ + 1, + 4, + 3 + ], + "abstract": "We propose PermaKey, a novel approach to representation learning based on object keypoints. It leverages the predictability of local image regions from spatial neighborhoods to identify salient regions that correspond to object parts, which are then converted to keypoints. Unlike prior approaches, it utilizes predictability to discover object keypoints, an intrinsic property of objects. This ensures that it does not overly bias keypoints to focus on characteristics that are not unique to objects, such as movement, shape, colour etc. We demonstrate the efficacy of PermaKey on Atari where it learns keypoints corresponding to the most salient object parts and is robust to certain visual distractors. 
Further, on downstream RL tasks in the Atari domain we demonstrate how agents equipped with our keypoints outperform those using competing alternatives, even on challenging environments with moving backgrounds or distractor objects.\n", + "title": "Unsupervised Object Keypoint Learning using Local Spatial Predictability", + "authors": [ + "Anand Gopalakrishnan", + "Sjoerd van Steenkiste", + "J\u00fcrgen Schmidhuber" + ], + "emails": [ + "~Anand_Gopalakrishnan1", + "~Sjoerd_van_Steenkiste1", + "~J\u00fcrgen_Schmidhuber1" + ], + "rank": 33 + }, + { + "url": "https://openreview.net/forum?id=LmUJqB1Cz8", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 7, + 9 + ], + "rating": "7.62", + "confidences": [ + 2, + 3, + 4, + 4 + ], + "abstract": "Safe and reliable electricity transmission in power grids is crucial for modern society. It is thus quite natural that there has been a growing interest in the automatic management of power grids, exempli\ufb01ed by the Learning to Run a Power Network Challenge (L2RPN), modeling the problem as a reinforcement learning (RL) task. However, it is highly challenging to manage a real-world scale power grid, mostly due to the massive scale of its state and action space. In this paper, we present an off-policy actor-critic approach that effectively tackles the unique challenges in power grid management by RL, adopting the hierarchical policy together with the afterstate representation. Our agent ranked \ufb01rst in the latest challenge (L2RPN WCCI 2020), being able to avoid disastrous situations while maintaining the highest level of operational ef\ufb01ciency in every test scenarios. 
This paper provides a formal description of the algorithmic aspect of our approach, as well as further experimental studies on diverse power grids.", + "title": "Winning the L2RPN Challenge: Power Grid Management via Semi-Markov Afterstate Actor-Critic", + "authors": [ + "Deunsol Yoon", + "Sunghoon Hong", + "Byung-Jun Lee", + "Kee-Eung Kim" + ], + "emails": [ + "~Sunghoon_Hong2", + "~Byung-Jun_Lee1", + "~Deunsol_Yoon1", + "~Kee-Eung_Kim4" + ], + "rank": 34 + }, + { + "url": "https://openreview.net/forum?id=qYda4oLEc1", + "decision": "Accept (Spotlight)", + "ratings": [ + 6, + 6, + 9, + 9 + ], + "rating": "7.60", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "This paper frames a general prediction system as an observer traveling around a continuous space, measuring values at some locations, and predicting them at others. The observer is completely agnostic about any particular task being solved; it cares only about measurement locations and their values. This perspective leads to a machine learning framework in which seemingly unrelated tasks can be solved by a single model, by embedding their input and output variables into a shared space. An implementation of the framework is developed in which these variable embeddings are learned jointly with internal model parameters. In experiments, the approach is shown to (1) recover intuitive locations of variables in space and time, (2) exploit regularities across related datasets with completely disjoint input and output spaces, and (3) exploit regularities across seemingly unrelated tasks, outperforming task-specific single-task models and multi-task learning alternatives. 
The results suggest that even seemingly unrelated tasks may originate from similar underlying processes, a fact that the traveling observer model can use to make better predictions.", + "title": "The Traveling Observer Model: Multi-task Learning Through Spatial Variable Embeddings", + "authors": [ + "Elliot Meyerson", + "Risto Miikkulainen" + ], + "emails": [ + "~Elliot_Meyerson1", + "~Risto_Miikkulainen1" + ], + "rank": 35 + }, + { + "url": "https://openreview.net/forum?id=a-xFK8Ymz5J", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 7, + 9, + 8, + 7 + ], + "rating": "7.55", + "confidences": [ + 3, + 5, + 4, + 3, + 5 + ], + "abstract": "In this work, we propose DiffWave, a versatile diffusion probabilistic model for conditional and unconditional waveform generation. The model is non-autoregressive, and converts the white noise signal into structured waveform through a Markov chain with a constant number of steps at synthesis. It is efficiently trained by optimizing a variant of variational bound on the data likelihood. DiffWave produces high-fidelity audios in different waveform generation tasks, including neural vocoding conditioned on mel spectrogram, class-conditional generation, and unconditional generation. We demonstrate that DiffWave matches a strong WaveNet vocoder in terms of speech quality (MOS: 4.44 versus 4.43), while synthesizing orders of magnitude faster. 
In particular, it significantly outperforms autoregressive and GAN-based waveform models in the challenging unconditional generation task in terms of audio quality and sample diversity from various automatic and human evaluations.", + "title": "DiffWave: A Versatile Diffusion Model for Audio Synthesis", + "authors": [ + "Zhifeng Kong", + "Wei Ping", + "Jiaji Huang", + "Kexin Zhao", + "Bryan Catanzaro" + ], + "emails": [ + "~Bryan_Catanzaro1", + "eng.ucsd.edu", + "~Wei_Ping1", + "baidu.com", + "~Jiaji_Huang1" + ], + "rank": 36 + }, + { + "url": "https://openreview.net/forum?id=rALA0Xo6yNJ", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 8, + 7, + 8 + ], + "rating": "7.53", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Current reinforcement learning (RL) algorithms can be brittle and difficult to use, especially when learning goal-reaching behaviors from sparse rewards. Although supervised imitation learning provides a simple and stable alternative, it requires access to demonstrations from a human supervisor. In this paper, we study RL algorithms that use imitation learning to acquire goal reaching policies from scratch, without the need for expert demonstrations or a value function. In lieu of demonstrations, we leverage the property that any trajectory is a successful demonstration for reaching the final state in that same trajectory. We propose a simple algorithm in which an agent continually relabels and imitates the trajectories it generates to progressively learn goal-reaching behaviors from scratch. Each iteration, the agent collects new trajectories using the latest policy, and maximizes the likelihood of the actions along these trajectories under the goal that was actually reached, so as to improve the policy. 
We formally show that this iterated supervised learning procedure optimizes a bound on the RL objective, derive performance bounds of the learned policy, and empirically demonstrate improved goal-reaching performance and robustness over current RL algorithms in several benchmark tasks. ", + "title": "Learning to Reach Goals via Iterated Supervised Learning", + "authors": [ + "Dibya Ghosh", + "Abhishek Gupta", + "Ashwin Reddy", + "Justin Fu", + "Coline Manon Devin", + "Benjamin Eysenbach", + "Sergey Levine" + ], + "emails": [ + "~Coline_Manon_Devin1", + "~Justin_Fu1", + "~Dibya_Ghosh1", + "~Benjamin_Eysenbach1", + "~Ashwin_Reddy1", + "~Abhishek_Gupta1", + "~Sergey_Levine1" + ], + "rank": 37 + }, + { + "url": "https://openreview.net/forum?id=RLRXCV6DbEJ", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 8, + 8, + 7 + ], + "rating": "7.53", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "We present a hierarchical VAE that, for the first time, generates samples quickly and outperforms the PixelCNN in log-likelihood on all natural image benchmarks. We begin by observing that, in theory, VAEs can actually represent autoregressive models, as well as faster, better models if they exist, when made sufficiently deep. Despite this, autoregressive models have historically outperformed VAEs in log-likelihood. We test if insufficient depth explains why by scaling a VAE to greater stochastic depth than previously explored and evaluating it CIFAR-10, ImageNet, and FFHQ. In comparison to the PixelCNN, these very deep VAEs achieve higher likelihoods, use fewer parameters, generate samples thousands of times faster, and are more easily applied to high-resolution images. Qualitative studies suggest this is because the VAE learns efficient hierarchical visual representations. 
We release our source code and models at https://github.com/openai/vdvae.", + "title": "Very Deep VAEs Generalize Autoregressive Models and Can Outperform Them on Images", + "authors": [ + "Rewon Child" + ], + "emails": [ + "~Rewon_Child1" + ], + "rank": 38 + }, + { + "url": "https://openreview.net/forum?id=Jnspzp-oIZE", + "decision": "Accept (Spotlight)", + "ratings": [ + 9, + 7, + 7, + 7 + ], + "rating": "7.53", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "A common approach to define convolutions on meshes is to interpret them as a graph and apply graph convolutional networks (GCNs). Such GCNs utilize isotropic kernels and are therefore insensitive to the relative orientation of vertices and thus to the geometry of the mesh as a whole. We propose Gauge Equivariant Mesh CNNs which generalize GCNs to apply anisotropic gauge equivariant kernels. Since the resulting features carry orientation information, we introduce a geometric message passing scheme defined by parallel transporting features over mesh edges. Our experiments validate the significantly improved expressivity of the proposed model over conventional GCNs and other methods.", + "title": "Gauge Equivariant Mesh CNNs: Anisotropic convolutions on geometric graphs", + "authors": [ + "Pim De Haan", + "Maurice Weiler", + "Taco Cohen", + "Max Welling" + ], + "emails": [ + "~Pim_De_Haan1", + "~Maurice_Weiler1", + "~Max_Welling1", + "~Taco_Cohen1" + ], + "rank": 39 + }, + { + "url": "https://openreview.net/forum?id=rsf1z-JSj87", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 8, + 7, + 8 + ], + "rating": "7.53", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Modern text-to-speech synthesis pipelines typically involve multiple processing stages, each of which is designed or learnt independently from the rest. 
In this work, we take on the challenging task of learning to synthesise speech from normalised text or phonemes in an end-to-end manner, resulting in models which operate directly on character or phoneme input sequences and produce raw speech audio outputs. Our proposed generator is feed-forward and thus efficient for both training and inference, using a differentiable alignment scheme based on token length prediction. It learns to produce high fidelity audio through a combination of adversarial feedback and prediction losses constraining the generated audio to roughly match the ground truth in terms of its total duration and mel-spectrogram. To allow the model to capture temporal variation in the generated audio, we employ soft dynamic time warping in the spectrogram-based prediction loss. The resulting model achieves a mean opinion score exceeding 4 on a 5 point scale, which is comparable to the state-of-the-art models relying on multi-stage training and additional supervision.", + "title": "End-to-end Adversarial Text-to-Speech", + "authors": [ + "Jeff Donahue", + "Sander Dieleman", + "Mikolaj Binkowski", + "Erich Elsen", + "Karen Simonyan" + ], + "emails": [ + "~Jeff_Donahue1", + "~Mikolaj_Binkowski1", + "~Erich_Elsen1", + "~Sander_Dieleman1", + "~Karen_Simonyan1" + ], + "rank": 40 + }, + { + "url": "https://openreview.net/forum?id=8PS8m9oYtNy", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 7, + 7, + 8 + ], + "rating": "7.53", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Normalizing flows define a probability distribution by an explicit invertible transformation z=f(x). In this work, we present implicit normalizing flows (ImpFlows), which generalize normalizing flows by allowing the mapping to be implicitly defined by the roots of an equation F(z,x)=0. ImpFlows build on residual flows (ResFlows) with a proper balance between expressiveness and tractability. 
Through theoretical analysis, we show that the function space of ImpFlow is strictly richer than that of ResFlows. Furthermore, for any ResFlow with a fixed number of blocks, there exists some function that ResFlow has a non-negligible approximation error. However, the function is exactly representable by a single-block ImpFlow. We propose a scalable algorithm to train and draw samples from ImpFlows. Empirically, we evaluate ImpFlow on several classification and density modeling tasks, and ImpFlow outperforms ResFlow with a comparable amount of parameters on all the benchmarks.", + "title": "Implicit Normalizing Flows", + "authors": [ + "Cheng Lu", + "Jianfei Chen", + "Chongxuan Li", + "Qiuhao Wang", + "Jun Zhu" + ], + "emails": [ + "~Jun_Zhu2", + "~Chongxuan_Li1", + "~Jianfei_Chen1", + "~Qiuhao_Wang1", + "~Cheng_Lu5" + ], + "rank": 41 + }, + { + "url": "https://openreview.net/forum?id=3AOj0RCNC2", + "decision": "Accept (Oral)", + "ratings": [ + 8, + 8, + 6, + 8 + ], + "rating": "7.50", + "confidences": [ + 5, + 5, + 5, + 5 + ], + "abstract": "The ability to learn continually without forgetting the past tasks is a desired attribute for artificial learning systems. Existing approaches to enable such learning in artificial neural networks usually rely on network growth, importance based weight update or replay of old data from the memory. In contrast, we propose a novel approach where a neural network learns new tasks by taking gradient steps in the orthogonal direction to the gradient subspaces deemed important for the past tasks. We find the bases of these subspaces by analyzing network representations (activations) after learning each task with Singular Value Decomposition (SVD) in a single shot manner and store them in the memory as Gradient Projection Memory (GPM). With qualitative and quantitative analyses, we show that such orthogonal gradient descent induces minimum to no interference with the past tasks, thereby mitigates forgetting. 
We evaluate our algorithm on diverse image classification datasets with short and long sequences of tasks and report better or on-par performance compared to the state-of-the-art approaches. ", + "title": "Gradient Projection Memory for Continual Learning", + "authors": [ + "Gobinda Saha", + "Isha Garg", + "Kaushik Roy" + ], + "emails": [ + "~Isha_Garg1", + "~Gobinda_Saha1", + "~Kaushik_Roy1" + ], + "rank": 42 + }, + { + "url": "https://openreview.net/forum?id=Ysuv-WOFeKR", + "decision": "Accept (Oral)", + "ratings": [ + 9, + 6, + 7, + 8 + ], + "rating": "7.50", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Reinforcement learning provides a general framework for flexible decision making and control, but requires extensive data collection for each new task that an agent needs to learn. In other machine learning fields, such as natural language processing or computer vision, pre-training on large, previously collected datasets to bootstrap learning for new tasks has emerged as a powerful paradigm to reduce data requirements when learning a new task. In this paper, we ask the following question: how can we enable similarly useful pre-training for RL agents? We propose a method for pre-training behavioral priors that can capture complex input-output relationships observed in successful trials from a wide range of previously seen tasks, and we show how this learned prior can be used for rapidly learning new tasks without impeding the RL agent's ability to try out novel behaviors. We demonstrate the effectiveness of our approach in challenging robotic manipulation domains involving image observations and sparse reward functions, where our method outperforms prior works by a substantial margin. 
Additional materials can be found on our project website: https://sites.google.com/view/parrot-rl", + "title": "Parrot: Data-Driven Behavioral Priors for Reinforcement Learning", + "authors": [ + "Avi Singh", + "Huihan Liu", + "Gaoyue Zhou", + "Albert Yu", + "Nicholas Rhinehart", + "Sergey Levine" + ], + "emails": [ + "~Huihan_Liu1", + "~Gaoyue_Zhou1", + "~Albert_Yu1", + "~Nicholas_Rhinehart1", + "~Sergey_Levine1", + "~Avi_Singh1" + ], + "rank": 43 + }, + { + "url": "https://openreview.net/forum?id=dYeAHXnpWJ4", + "decision": "Accept (Oral)", + "ratings": [ + 9, + 9, + 7, + 5 + ], + "rating": "7.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Current methods for the interpretability of discriminative deep neural networks commonly rely on the model's input-gradients, i.e., the gradients of the output logits w.r.t. the inputs. The common assumption is that these input-gradients contain information regarding p\u03b8(y\u2223x), the model's discriminative capabilities, thus justifying their use for interpretability. However, in this work, we show that these input-gradients can be arbitrarily manipulated as a consequence of the shift-invariance of softmax without changing the discriminative function. This leaves an open question: given that input-gradients can be arbitrary, why are they highly structured and explanatory in standard models?\n\nIn this work, we re-interpret the logits of standard softmax-based classifiers as unnormalized log-densities of the data distribution and show that input-gradients can be viewed as gradients of a class-conditional generative model p\u03b8(x\u2223y) implicit in the discriminative model. This leads us to hypothesize that the highly structured and explanatory nature of input-gradients may be due to the alignment of this class-conditional model p\u03b8(x\u2223y) with that of the ground truth data distribution pdata(x\u2223y). We test this hypothesis by studying the effect of density alignment on gradient explanations. 
To achieve this density alignment, we use an algorithm called score-matching, and propose novel approximations to this algorithm to enable training large-scale models.\n\nOur experiments show that improving the alignment of the implicit density model with the data distribution enhances gradient structure and explanatory power while reducing this alignment has the opposite effect. This also leads us to conjecture that unintended density alignment in standard neural network training may explain the highly structured nature of input-gradients observed in practice. Overall, our finding that input-gradients capture information regarding an implicit generative model implies that we need to re-think their use for interpreting discriminative models.", + "title": "Rethinking the Role of Gradient-based Attribution Methods for Model Interpretability", + "authors": [ + "Suraj Srinivas", + "Francois Fleuret" + ], + "emails": [ + "unige.ch", + "~Suraj_Srinivas1" + ], + "rank": 44 + }, + { + "url": "https://openreview.net/forum?id=Ua6zuk0WRH", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 8, + 8, + 7 + ], + "rating": "7.50", + "confidences": [ + 3, + 4, + 4, + 5 + ], + "abstract": "We introduce Performers, Transformer architectures which can estimate regular (softmax) full-rank-attention Transformers with provable accuracy, but using only linear (as opposed to quadratic) space and time complexity, without relying on any priors such as sparsity or low-rankness. To approximate softmax attention-kernels, Performers use a novel Fast Attention Via positive Orthogonal Random features approach (FAVOR+), which may be of independent interest for scalable kernel methods. FAVOR+ can also be used to efficiently model kernelizable attention mechanisms beyond softmax. This representational power is crucial to accurately compare softmax with other kernels for the first time on large-scale tasks, beyond the reach of regular Transformers, and investigate optimal attention-kernels. 
Performers are linear architectures fully compatible with regular Transformers and with strong theoretical guarantees: unbiased or nearly-unbiased estimation of the attention matrix, uniform convergence and low estimation variance. We tested Performers on a rich set of tasks stretching from pixel-prediction through text models to protein sequence modeling. We demonstrate competitive results with other examined efficient sparse and dense attention methods, showcasing effectiveness of the novel attention-learning paradigm leveraged by Performers. ", + "title": "Rethinking Attention with Performers", + "authors": [ + "Krzysztof Marcin Choromanski", + "Valerii Likhosherstov", + "David Dohan", + "Xingyou Song", + "Andreea Gane", + "Tamas Sarlos", + "Peter Hawkins", + "Jared Quincy Davis", + "Afroz Mohiuddin", + "Lukasz Kaiser", + "David Benjamin Belanger", + "Lucy J Colwell", + "Adrian Weller" + ], + "emails": [ + "~Lukasz_Kaiser1", + "~David_Dohan1", + "~Tamas_Sarlos1", + "~Krzysztof_Marcin_Choromanski1", + "cam.ac.uk", + "~Adrian_Weller1", + "google.com", + "~Xingyou_Song1", + "~David_Benjamin_Belanger1", + "~Andreea_Gane1", + "~Lucy_J_Colwell1", + "~Afroz_Mohiuddin1", + "~Jared_Quincy_Davis1" + ], + "rank": 45 + }, + { + "url": "https://openreview.net/forum?id=0XXpJ4OtjW", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 6, + 9 + ], + "rating": "7.50", + "confidences": [ + 3, + 3, + 4 + ], + "abstract": "We propose a method for meta-learning reinforcement learning algorithms by searching over the space of computational graphs which compute the loss function for a value-based model-free RL agent to optimize. The learned algorithms are domain-agnostic and can generalize to new environments not seen during training. Our method can both learn from scratch and bootstrap off known existing algorithms, like DQN, enabling interpretable modifications which improve performance. 
Learning from scratch on simple classical control and gridworld tasks, our method rediscovers the temporal-difference (TD) algorithm. Bootstrapped from DQN, we highlight two learned algorithms which obtain good generalization performance over other classical control tasks, gridworld type tasks, and Atari games. The analysis of the learned algorithm behavior shows resemblance to recently proposed RL algorithms that address overestimation in value-based methods.", + "title": "Evolving Reinforcement Learning Algorithms", + "authors": [ + "John D Co-Reyes", + "Yingjie Miao", + "Daiyi Peng", + "Esteban Real", + "Quoc V Le", + "Sergey Levine", + "Honglak Lee", + "Aleksandra Faust" + ], + "emails": [ + "~Daiyi_Peng1", + "~Quoc_V_Le1", + "~Yingjie_Miao1", + "~John_D_Co-Reyes1", + "google.com", + "~Aleksandra_Faust1", + "~Honglak_Lee2", + "~Sergey_Levine1" + ], + "rank": 46 + }, + { + "url": "https://openreview.net/forum?id=xpx9zj7CUlY", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 8, + 8, + 7 + ], + "rating": "7.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "The successes of deep learning, variational inference, and many other fields have been aided by specialized implementations of reverse-mode automatic differentiation (AD) to compute gradients of mega-dimensional objectives. The AD techniques underlying these tools were designed to compute exact gradients to numerical precision, but modern machine learning models are almost always trained with stochastic gradient descent. Why spend computation and memory on exact (minibatch) gradients only to use them for stochastic optimization? We develop a general framework and approach for randomized automatic differentiation (RAD), which can allow unbiased gradient estimates to be computed with reduced memory in return for variance. We examine limitations of the general approach, and argue that we must leverage problem specific structure to realize benefits. 
We develop RAD techniques for a variety of simple neural network architectures, and show that for a fixed memory budget, RAD converges in fewer iterations than using a small batch size for feedforward networks, and in a similar number for recurrent networks. We also show that RAD can be applied to scientific computing, and use it to develop a low-memory stochastic gradient method for optimizing the control parameters of a linear reaction-diffusion PDE representing a fission reactor.", + "title": "Randomized Automatic Differentiation", + "authors": [ + "Deniz Oktay", + "Nick McGreivy", + "Joshua Aduol", + "Alex Beatson", + "Ryan P Adams" + ], + "emails": [ + "~Ryan_P_Adams1", + "~Alex_Beatson1", + "princeton.edu", + "~Deniz_Oktay2" + ], + "rank": 47 + }, + { + "url": "https://openreview.net/forum?id=ZPa2SyGcbwh", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 8, + 7, + 8 + ], + "rating": "7.50", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Label noise is frequently observed in real-world large-scale datasets. The noise is introduced due to a variety of reasons; it is heterogeneous and feature-dependent. Most existing approaches to handling noisy labels fall into two categories: they either assume an ideal feature-independent noise, or remain heuristic without theoretical guarantees. In this paper, we propose to target a new family of feature-dependent label noise, which is much more general than commonly used i.i.d. label noise and encompasses a broad spectrum of noise patterns. Focusing on this general noise family, we propose a progressive label correction algorithm that iteratively corrects labels and refines the model. We provide theoretical guarantees showing that for a wide variety of (unknown) noise patterns, a classifier trained with this strategy converges to be consistent with the Bayes classifier. 
In experiments, our method outperforms SOTA baselines and is robust to various noise types and levels.", + "title": "Learning with Feature-Dependent Label Noise: A Progressive Approach", + "authors": [ + "Yikai Zhang", + "Songzhu Zheng", + "Pengxiang Wu", + "Mayank Goswami", + "Chao Chen" + ], + "emails": [ + "~Mayank_Goswami1", + "~Songzhu_Zheng1", + "~Chao_Chen1", + "~Yikai_Zhang1", + "~Pengxiang_Wu1" + ], + "rank": 48 + }, + { + "url": "https://openreview.net/forum?id=xvxPuCkCNPO", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 8, + 7, + 7 + ], + "rating": "7.50", + "confidences": [ + 3, + 3, + 4, + 2 + ], + "abstract": "We consider the problem of learning to communicate using multi-agent reinforcement learning (MARL). A common approach is to learn off-policy, using data sampled from a replay buffer. However, messages received in the past may not accurately reflect the current communication policy of each agent, and this complicates learning. We therefore introduce a 'communication correction' which accounts for the non-stationarity of observed communication induced by multi-agent learning. It works by relabelling the received message to make it likely under the communicator's current policy, and thus be a better reflection of the receiver's current environment. To account for cases in which agents are both senders and receivers, we introduce an ordered relabelling scheme. Our correction is computationally efficient and can be integrated with a range of off-policy algorithms. 
We find in our experiments that it substantially improves the ability of communicating MARL systems to learn across a variety of cooperative and competitive tasks.", + "title": "Correcting experience replay for multi-agent communication", + "authors": [ + "Sanjeevan Ahilan", + "Peter Dayan" + ], + "emails": [ + "~Peter_Dayan1", + "~Sanjeevan_Ahilan1" + ], + "rank": 49 + }, + { + "url": "https://openreview.net/forum?id=tilovEHA3YS", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 8, + 8, + 7 + ], + "rating": "7.47", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "We consider the problem of estimating the number of distinct elements in a large data set (or, equivalently, the support size of the distribution induced by the data set) from a random sample of its elements. The problem occurs in many applications, including biology, genomics, computer systems and linguistics. A line of research spanning the last decade resulted in algorithms that estimate the support up to \u00b1\u03b5n from a sample of size O(log2\u2061(1/\u03b5)\u22c5n/log\u2061n), where n is the data set size. Unfortunately, this bound is known to be tight, limiting further improvements to the complexity of this problem. In this paper we consider estimation algorithms augmented with a machine-learning-based predictor that, given any element, returns an estimation of its frequency. We show that if the predictor is correct up to a constant approximation factor, then the sample complexity can be reduced significantly, to\n log\u2061(1/\u03b5)\u22c5n1\u2212\u0398(1/log\u2061(1/\u03b5)).\nWe evaluate the proposed algorithms on a collection of data sets, using the neural-network based estimators from {Hsu et al, ICLR'19} as predictors. 
Our experiments demonstrate substantial (up to 3x) improvements in the estimation accuracy compared to the state of the art algorithm.", + "title": "Learning-based Support Estimation in Sublinear Time", + "authors": [ + "Talya Eden", + "Piotr Indyk", + "Shyam Narayanan", + "Ronitt Rubinfeld", + "Sandeep Silwal", + "Tal Wagner" + ], + "emails": [ + "~Piotr_Indyk1", + "~Tal_Wagner1", + "~Ronitt_Rubinfeld1", + "mit.edu", + "gmail.com" + ], + "rank": 50 + }, + { + "url": "https://openreview.net/forum?id=0-uUGPbIjD", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 8, + 7, + 8 + ], + "rating": "7.47", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Prior AI breakthroughs in complex games have focused on either the purely adversarial or purely cooperative settings. In contrast, Diplomacy is a game of shifting alliances that involves both cooperation and competition. For this reason, Diplomacy has proven to be a formidable research challenge. In this paper we describe an agent for the no-press variant of Diplomacy that combines supervised learning on human data with one-step lookahead search via regret minimization. Regret minimization techniques have been behind previous AI successes in adversarial games, most notably poker, but have not previously been shown to be successful in large-scale games involving cooperation. 
We show that our agent greatly exceeds the performance of past no-press Diplomacy bots, is unexploitable by expert humans, and ranks in the top 2% of human players when playing anonymous games on a popular Diplomacy website.", + "title": "Human-Level Performance in No-Press Diplomacy via Equilibrium Search", + "authors": [ + "Jonathan Gray", + "Adam Lerer", + "Anton Bakhtin", + "Noam Brown" + ], + "emails": [ + "~Anton_Bakhtin1", + "~Noam_Brown2", + "~Jonathan_Gray2", + "~Adam_Lerer1" + ], + "rank": 51 + }, + { + "url": "https://openreview.net/forum?id=30EvkP2aQLD", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 7, + 8, + 7 + ], + "rating": "7.46", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Offline reinforcement learning seeks to utilize offline (observational) data to guide the learning of (causal) sequential decision making strategies. The hope is that offline reinforcement learning coupled with function approximation methods (to deal with the curse of dimensionality) can provide a means to help alleviate the excessive sample complexity burden in modern sequential decision making problems. However, the extent to which this broader approach can be effective is not well understood, where the literature largely consists of sufficient conditions.\n\nThis work focuses on the basic question of what are necessary representational and distributional conditions that permit provable sample-efficient offline reinforcement learning. Perhaps surprisingly, our main result shows that even if: i) we have realizability in that the true value function of \\emph{every} policy is linear in a given set of features and 2) our off-policy data has good coverage over all features (under a strong spectral condition), any algorithm still (information-theoretically) requires a number of offline samples that is exponential in the problem horizon to non-trivially estimate the value of \\emph{any} given policy. 
Our results highlight that sample-efficient offline policy evaluation is not possible unless significantly stronger conditions hold; such conditions include either having low distribution shift (where the offline data distribution is close to the distribution of the policy to be evaluated) or significantly stronger representational conditions (beyond realizability).", + "title": "What are the Statistical Limits of Offline RL with Linear Function Approximation?", + "authors": [ + "Ruosong Wang", + "Dean Foster", + "Sham M. Kakade" + ], + "emails": [ + "~Dean_Foster1", + "~Sham_M._Kakade1", + "~Ruosong_Wang1" + ], + "rank": 52 + }, + { + "url": "https://openreview.net/forum?id=mLcmdlEUxy-", + "decision": "Accept (Spotlight)", + "ratings": [ + 9, + 7, + 7, + 7 + ], + "rating": "7.46", + "confidences": [ + 3, + 3, + 4, + 3 + ], + "abstract": "We explore the hypothesis that learning modular structures which reflect the dynamics of the environment can lead to better generalization and robustness to changes that only affect a few of the underlying causes. We propose Recurrent Independent Mechanisms (RIMs), a new recurrent architecture in which multiple groups of recurrent cells operate with nearly independent transition dynamics, communicate only sparingly through the bottleneck of attention, and compete with each other so they are updated only at time steps where they are most relevant. 
We show that this leads to specialization amongst the RIMs, which in turn allows for remarkably improved generalization on tasks where some factors of variation differ systematically between training and evaluation.\n", + "title": "Recurrent Independent Mechanisms", + "authors": [ + "Anirudh Goyal", + "Alex Lamb", + "Jordan Hoffmann", + "Shagun Sodhani", + "Sergey Levine", + "Yoshua Bengio", + "Bernhard Sch\u00f6lkopf" + ], + "emails": [ + "~Bernhard_Sch\u00f6lkopf1", + "~Yoshua_Bengio1", + "~Jordan_Hoffmann1", + "~Anirudh_Goyal1", + "~Sergey_Levine1", + "~Alex_Lamb1", + "~Shagun_Sodhani1" + ], + "rank": 53 + }, + { + "url": "https://openreview.net/forum?id=Rhsu5qD36cL", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 9, + 7, + 6, + 8 + ], + "rating": "7.44", + "confidences": [ + 3, + 4, + 3, + 4, + 4 + ], + "abstract": "Classifying sequential data as early and as accurately as possible is a challenging yet critical problem, especially when a sampling cost is high. One algorithm that achieves this goal is the sequential probability ratio test (SPRT), which is known as Bayes-optimal: it can keep the expected number of data samples as small as possible, given the desired error upper-bound. However, the original SPRT makes two critical assumptions that limit its application in real-world scenarios: (i) samples are independently and identically distributed, and (ii) the likelihood of the data being derived from each class can be calculated precisely. Here, we propose the SPRT-TANDEM, a deep neural network-based SPRT algorithm that overcomes the above two obstacles. The SPRT-TANDEM sequentially estimates the log-likelihood ratio of two alternative hypotheses by leveraging a novel Loss function for Log-Likelihood Ratio estimation (LLLR) while allowing correlations up to N(\u2208N) preceding samples. 
In tests on one original and two public video databases, Nosaic MNIST, UCF101, and SiW, the SPRT-TANDEM achieves statistically significantly better classification accuracy than other baseline classifiers, with a smaller number of data samples. The code and Nosaic MNIST are publicly available at https://github.com/TaikiMiyagawa/SPRT-TANDEM.", + "title": "Sequential Density Ratio Estimation for Simultaneous Optimization of Speed and Accuracy", + "authors": [ + "Akinori F Ebihara", + "Taiki Miyagawa", + "Kazuyuki Sakurai", + "Hitoshi Imaoka" + ], + "emails": [ + "~Akinori_F_Ebihara1", + "nec.com" + ], + "rank": 54 + }, + { + "url": "https://openreview.net/forum?id=wpSWuz_hyqA", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 6, + 8, + 8 + ], + "rating": "7.43", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Recent work has shown that large text-based neural language models acquire a surprising propensity for one-shot learning. Here, we show that an agent situated in a simulated 3D world, and endowed with a novel dual-coding external memory, can exhibit similar one-shot word learning when trained with conventional RL algorithms. After a single introduction to a novel object via visual perception and language (\"This is a dax\"), the agent can manipulate the object as instructed (\"Put the dax on the bed\"), combining short-term, within-episode knowledge of the nonsense word with long-term lexical and motor knowledge. We find that, under certain training conditions and with a particular memory writing mechanism, the agent's one-shot word-object binding generalizes to novel exemplars within the same ShapeNet category, and is effective in settings with unfamiliar numbers of objects. We further show how dual-coding memory can be exploited as a signal for intrinsic motivation, stimulating the agent to seek names for objects that may be useful later. 
Together, the results demonstrate that deep neural networks can exploit meta-learning, episodic memory and an explicitly multi-modal environment to account for 'fast-mapping', a fundamental pillar of human cognitive development and a potentially transformative capacity for artificial agents. ", + "title": "Grounded Language Learning Fast and Slow", + "authors": [ + "Felix Hill", + "Olivier Tieleman", + "Tamara von Glehn", + "Nathaniel Wong", + "Hamza Merzic", + "Stephen Clark" + ], + "emails": [ + "~Hamza_Merzic1", + "~Nathaniel_Wong1", + "~Felix_Hill1", + "~Olivier_Tieleman1", + "google.com", + "~Stephen_Clark1" + ], + "rank": 55 + }, + { + "url": "https://openreview.net/forum?id=l0mSUROpwY", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 9, + 5, + 8, + 9 + ], + "rating": "7.43", + "confidences": [ + 4, + 4, + 4, + 5, + 4 + ], + "abstract": "Proteins perform a large variety of functions in living organisms and thus play a key role in biology. However, commonly used algorithms in protein representation learning were not specifically designed for protein data, and are therefore not able to capture all relevant structural levels of a protein during learning. To fill this gap, we propose two new learning operators, specifically designed to process protein structures. First, we introduce a novel convolution operator that considers the primary, secondary, and tertiary structure of a protein by using n-D convolutions defined on both the Euclidean distance, as well as multiple geodesic distances between the atoms in a multi-graph. Second, we introduce a set of hierarchical pooling operators that enable multi-scale protein analysis. 
We further evaluate the accuracy of our algorithms on common downstream tasks, where we outperform state-of-the-art protein learning algorithms.", + "title": "Intrinsic-Extrinsic Convolution and Pooling for Learning on 3D Protein Structures", + "authors": [ + "Pedro Hermosilla", + "Marco Sch\u00e4fer", + "Matej Lang", + "Gloria Fackelmann", + "Pere-Pau V\u00e1zquez", + "Barbora Kozlikova", + "Michael Krone", + "Tobias Ritschel", + "Timo Ropinski" + ], + "emails": [ + "cs.upc.edu", + "fi.muni.cz", + "~Pedro_Hermosilla1", + "mail.muni.cz", + "~Timo_Ropinski2", + "uni-tuebingen.de", + "~Tobias_Ritschel1", + "uni-ulm.de" + ], + "rank": 56 + }, + { + "url": "https://openreview.net/forum?id=QfTXQiGYudJ", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 8 + ], + "rating": "7.42", + "confidences": [ + 3, + 4, + 5 + ], + "abstract": "Convolutional Neural Networks (CNNs) have advanced existing medical systems for automatic disease diagnosis. However, a threat to these systems arises that adversarial attacks make CNNs vulnerable. Inaccurate diagnosis results make a negative influence on human healthcare. There is a need to investigate potential adversarial attacks to robustify deep medical diagnosis systems. On the other side, there are several modalities of medical images (e.g., CT, fundus, and endoscopic image) of which each type is significantly different from others. It is more challenging to generate adversarial perturbations for different types of medical images. In this paper, we propose an image-based medical adversarial attack method to consistently produce adversarial perturbations on medical images. The objective function of our method consists of a loss deviation term and a loss stabilization term. The loss deviation term increases the divergence between the CNN prediction of an adversarial example and its ground truth label. Meanwhile, the loss stabilization term ensures similar CNN predictions of this example and its smoothed input. 
From the perspective of the whole iterations for perturbation generation, the proposed loss stabilization term exhaustively searches the perturbation space to smooth the single spot for local optimum escape. We further analyze the KL-divergence of the proposed loss function and find that the loss stabilization term makes the perturbations updated towards a fixed objective spot while deviating from the ground truth. This stabilization ensures the proposed medical attack effective for different types of medical images while producing perturbations in small variance. Experiments on several medical image analysis benchmarks including the recent COVID-19 dataset show the stability of the proposed method.", + "title": "Stabilized Medical Image Attacks", + "authors": [ + "Gege Qi", + "Lijun GONG", + "Yibing Song", + "Kai Ma", + "Yefeng Zheng" + ], + "emails": [ + "~Yibing_Song1", + "~Gege_Qi1", + "~Lijun_GONG2", + "~Kai_Ma2", + "~Yefeng_Zheng2" + ], + "rank": 57 + }, + { + "url": "https://openreview.net/forum?id=WiGQBFuVRv", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 9, + 6, + 7 + ], + "rating": "7.41", + "confidences": [ + 4, + 5, + 3, + 5 + ], + "abstract": "Time series forecasting is often fundamental to scientific and engineering problems and enables decision making. With ever increasing data set sizes, a trivial solution to scale up predictions is to assume independence between interacting time series. However, modeling statistical dependencies can improve accuracy and enable analysis of interaction effects. Deep learning methods are well suited for this problem, but multi-variate models often assume a simple parametric distribution and do not scale to high dimensions. In this work we model the multi-variate temporal dynamics of time series via an autoregressive deep learning model, where the data distribution is represented by a conditioned normalizing flow. 
This combination retains the power of autoregressive models, such as good performance in extrapolation into the future, with the flexibility of flows as a general purpose high-dimensional distribution model, while remaining computationally tractable. We show that it improves over the state-of-the-art for standard metrics on many real-world data sets with several thousand interacting time-series.", + "title": "Multivariate Probabilistic Time Series Forecasting via Conditioned Normalizing Flows", + "authors": [ + "Kashif Rasul", + "Abdul-Saboor Sheikh", + "Ingmar Schuster", + "Urs M Bergmann", + "Roland Vollgraf" + ], + "emails": [ + "~Roland_Vollgraf1", + "zalando.de", + "~Urs_M_Bergmann1", + "~Abdul-Saboor_Sheikh1" + ], + "rank": 58 + }, + { + "url": "https://openreview.net/forum?id=yWkP7JuHX1", + "decision": "Accept (Oral)", + "ratings": [ + 8, + 8, + 6 + ], + "rating": "7.40", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "Differentiable rendering has paved the way to training neural networks to perform \u201cinverse graphics\u201d tasks such as predicting 3D geometry from monocular photographs. To train high performing models, most of the current approaches rely on multi-view imagery which are not readily available in practice. Recent Generative Adversarial Networks (GANs) that synthesize images, in contrast, seem to acquire 3D knowledge implicitly during training: object viewpoints can be manipulated by simply manipulating the latent codes. However, these latent codes often lack further physical interpretation and thus GANs cannot easily be inverted to perform explicit 3D reasoning. In this paper, we aim to extract and disentangle 3D knowledge learned by generative models by utilizing differentiable renderers. 
Key to our approach is to exploit GANs as a multi-view data generator to train an inverse graphics network using an off-the-shelf differentiable renderer, and the trained inverse graphics network as a teacher to disentangle the GAN's latent code into interpretable 3D properties. The entire architecture is trained iteratively using cycle consistency losses. We show that our approach significantly outperforms state-of-the-art inverse graphics networks trained on existing datasets, both quantitatively and via user studies. We further showcase the disentangled GAN as a controllable 3D \u201cneural renderer\", complementing traditional graphics renderers.", + "title": "Image GANs meet Differentiable Rendering for Inverse Graphics and Interpretable 3D Neural Rendering", + "authors": [ + "Yuxuan Zhang", + "Wenzheng Chen", + "Huan Ling", + "Jun Gao", + "Yinan Zhang", + "Antonio Torralba", + "Sanja Fidler" + ], + "emails": [ + "~Antonio_Torralba1", + "~Wenzheng_Chen1", + "~Jun_Gao3", + "~Yinan_Zhang2", + "~Huan_Ling1", + "~Yuxuan_Zhang1", + "~Sanja_Fidler1" + ], + "rank": 59 + }, + { + "url": "https://openreview.net/forum?id=KvyxFqZS_D", + "decision": "Accept (Oral)", + "ratings": [ + 9, + 7, + 7, + 7 + ], + "rating": "7.40", + "confidences": [ + 2, + 3, + 2, + 3 + ], + "abstract": "In the mean field regime, neural networks are appropriately scaled so that as the width tends to infinity, the learning dynamics tends to a nonlinear and nontrivial dynamical limit, known as the mean field limit. This lends a way to study large-width neural networks via analyzing the mean field limit. Recent works have successfully applied such analysis to two-layer networks and provided global convergence guarantees. 
The extension to multilayer ones however has been a highly challenging puzzle, and little is known about the optimization efficiency in the mean field regime when there are more than two layers.\n\nIn this work, we prove a global convergence result for unregularized feedforward three-layer networks in the mean field regime. We first develop a rigorous framework to establish the mean field limit of three-layer networks under stochastic gradient descent training. To that end, we propose the idea of a neuronal embedding, which comprises of a fixed probability space that encapsulates neural networks of arbitrary sizes. The identified mean field limit is then used to prove a global convergence guarantee under suitable regularity and convergence mode assumptions, which \u2013 unlike previous works on two-layer networks \u2013 does not rely critically on convexity. Underlying the result is a universal approximation property, natural of neural networks, which importantly is shown to hold at any finite training time (not necessarily at convergence) via an algebraic topology argument.", + "title": "Global Convergence of Three-layer Neural Networks in the Mean Field Regime", + "authors": [ + "Huy Tuan Pham", + "Phan-Minh Nguyen" + ], + "emails": [ + "~Phan-Minh_Nguyen1", + "stanford.edu" + ], + "rank": 60 + }, + { + "url": "https://openreview.net/forum?id=2m0g1wEafh", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 6, + 8, + 8 + ], + "rating": "7.38", + "confidences": [ + 4, + 2, + 4, + 3 + ], + "abstract": "Establishing a theoretical analysis that explains why deep learning can outperform shallow learning such as kernel methods is one of the biggest issues in the deep learning literature. 
Towards answering this question, we evaluate excess risk of a deep learning estimator trained by a noisy gradient descent with ridge regularization on a mildly overparameterized neural network, \nand discuss its superiority to a class of linear estimators that includes neural tangent kernel approach, random feature model, other kernel methods, k-NN estimator and so on. We consider a teacher-student regression model, and eventually show that {\\it any} linear estimator can be outperformed by deep learning in a sense of the minimax optimal rate especially for a high dimension setting. The obtained excess bounds are so-called fast learning rate which is faster than O(1/n) that is obtained by usual Rademacher complexity analysis. This discrepancy is induced by the non-convex geometry of the model and the noisy gradient descent used for neural network training provably reaches a near global optimal solution even though the loss landscape is highly non-convex. Although the noisy gradient descent does not employ any explicit or implicit sparsity inducing regularization, it shows a preferable generalization performance that dominates linear estimators.", + "title": "Benefit of deep learning with non-convex noisy gradient descent: Provable excess risk bound and superiority to kernel methods", + "authors": [ + "Taiji Suzuki", + "Shunta Akiyama" + ], + "emails": [ + "mist.i.u-tokyo.ac.jp", + "~Taiji_Suzuki1" + ], + "rank": 61 + }, + { + "url": "https://openreview.net/forum?id=6Tm1mposlrM", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 6, + 8, + 8 + ], + "rating": "7.38", + "confidences": [ + 4, + 2, + 3, + 4 + ], + "abstract": "In today's heavily overparameterized models, the value of the training loss provides few guarantees on model generalization ability. Indeed, optimizing only the training loss value, as is commonly done, can easily lead to suboptimal model quality. 
Motivated by the connection between geometry of the loss landscape and generalization---including a generalization bound that we prove here---we introduce a novel, effective procedure for instead simultaneously minimizing loss value and loss sharpness. In particular, our procedure, Sharpness-Aware Minimization (SAM), seeks parameters that lie in neighborhoods having uniformly low loss; this formulation results in a min-max optimization problem on which gradient descent can be performed efficiently. We present empirical results showing that SAM improves model generalization across a variety of benchmark datasets (e.g., CIFAR-{10, 100}, ImageNet, finetuning tasks) and models, yielding novel state-of-the-art performance for several. Additionally, we find that SAM natively provides robustness to label noise on par with that provided by state-of-the-art procedures that specifically target learning with noisy labels.", + "title": "Sharpness-aware Minimization for Efficiently Improving Generalization", + "authors": [ + "Pierre Foret", + "Ariel Kleiner", + "Hossein Mobahi", + "Behnam Neyshabur" + ], + "emails": [ + "google.com", + "~Pierre_Foret1", + "~Hossein_Mobahi2", + "~Behnam_Neyshabur1" + ], + "rank": 62 + }, + { + "url": "https://openreview.net/forum?id=Ud3DSz72nYR", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 8, + 7 + ], + "rating": "7.36", + "confidences": [ + 2, + 4, + 5 + ], + "abstract": "We investigate a deep reinforcement learning (RL) architecture that supports explaining why a learned agent prefers one action over another. The key idea is to learn action-values that are directly represented via human-understandable properties of expected futures. This is realized via the embedded self-prediction (ESP) model, which learns said properties in terms of human provided features. Action preferences can then be explained by contrasting the future properties predicted for each action. 
To address cases where there are a large number of features, we develop a novel method for computing minimal sufficient explanations from an ESP. Our case studies in three domains, including a complex strategy game, show that ESP models can be effectively learned and support insightful explanations. ", + "title": "Contrastive Explanations for Reinforcement Learning via Embedded Self Predictions", + "authors": [ + "Zhengxian Lin", + "Kin-Ho Lam", + "Alan Fern" + ], + "emails": [ + "~Alan_Fern1", + "~Zhengxian_Lin1", + "~Kin-Ho_Lam1" + ], + "rank": 63 + }, + { + "url": "https://openreview.net/forum?id=m1CD7tPubNy", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 6, + 8, + 7 + ], + "rating": "7.36", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "We show how feature maps in convolutional networks are susceptible to spatial bias. Due to a combination of architectural choices, the activation at certain locations is systematically elevated or weakened. The major source of this bias is the padding mechanism. Depending on several aspects of convolution arithmetic, this mechanism can apply the padding unevenly, leading to asymmetries in the learned weights. We demonstrate how such bias can be detrimental to certain tasks such as small object detection: the activation is suppressed if the stimulus lies in the impacted area, leading to blind spots and misdetection. 
We explore alternative padding methods and propose solutions for analyzing and mitigating spatial bias.\n", + "title": "Mind the Pad -- CNNs Can Develop Blind Spots", + "authors": [ + "Bilal Alsallakh", + "Narine Kokhlikyan", + "Vivek Miglani", + "Jun Yuan", + "Orion Reblitz-Richardson" + ], + "emails": [ + "~Bilal_Alsallakh1", + "nyu.edu", + "fb.com" + ], + "rank": 64 + }, + { + "url": "https://openreview.net/forum?id=F3s69XzWOia", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 8, + 7, + 7 + ], + "rating": "7.36", + "confidences": [ + 3, + 5, + 3, + 3 + ], + "abstract": "Circuits of biological neurons, such as in the functional parts of the brain can be modeled as networks of coupled oscillators. Inspired by the ability of these systems to express a rich set of outputs while keeping (gradients of) state variables bounded, we propose a novel architecture for recurrent neural networks. Our proposed RNN is based on a time-discretization of a system of second-order ordinary differential equations, modeling networks of controlled nonlinear oscillators. We prove precise bounds on the gradients of the hidden states, leading to the mitigation of the exploding and vanishing gradient problem for this RNN. Experiments show that the proposed RNN is comparable in performance to the state of the art on a variety of benchmarks, demonstrating the potential of this architecture to provide stable and accurate RNNs for processing complex sequential data.", + "title": "Coupled Oscillatory Recurrent Neural Network (coRNN): An accurate and (gradient) stable architecture for learning long time dependencies", + "authors": [ + "T. 
Konstantin Rusch", + "Siddhartha Mishra" + ], + "emails": [ + "~Siddhartha_Mishra1", + "~T._Konstantin_Rusch1" + ], + "rank": 65 + }, + { + "url": "https://openreview.net/forum?id=9OHFhefeB86", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 7, + 7, + 7 + ], + "rating": "7.36", + "confidences": [ + 5, + 3, + 3, + 3 + ], + "abstract": "Geometric variations like rotation, scaling, and viewpoint changes pose a significant challenge to visual understanding. One common solution is to directly model certain intrinsic structures, e.g., using landmarks. However, it then becomes non-trivial to build effective deep models, especially when the underlying non-Euclidean grid is irregular and coarse. Recent deep models using graph convolutions provide an appropriate framework to handle such non-Euclidean data, but many of them, particularly those based on global graph Laplacians, lack expressiveness to capture local features required for representation of signals lying on the non-Euclidean grid. The current paper introduces a new type of graph convolution with learnable low-rank local filters, which is provably more expressive than previous spectral graph convolution methods. The model also provides a unified framework for both spectral and spatial graph convolutions. To improve model robustness, regularization by local graph Laplacians is introduced. The representation stability against input graph data perturbation is theoretically proved, making use of the graph filter locality and the local graph regularization. 
Experiments on spherical mesh data, real-world facial expression recognition/skeleton-based action recognition data, and data with simulated graph noise show the empirical advantage of the proposed model.", + "title": "Graph Convolution with Low-rank Learnable Local Filters", + "authors": [ + "Xiuyuan Cheng", + "Zichen Miao", + "Qiang Qiu" + ], + "emails": [ + "~Qiang_Qiu1", + "~Zichen_Miao1", + "~Xiuyuan_Cheng1" + ], + "rank": 66 + }, + { + "url": "https://openreview.net/forum?id=NECTfffOvn1", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 9, + 6, + 6 + ], + "rating": "7.35", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "Adiabatic quantum computation is a form of computation that acts by slowly interpolating a quantum system between an easy to prepare initial state and a final state that represents a solution to a given computational problem. The choice of the interpolation schedule is critical to the performance: if at a certain time point, the evolution is too rapid, the system has a high probability to transfer to a higher energy state, which does not represent a solution to the problem. On the other hand, an evolution that is too slow leads to a loss of computation time and increases the probability of failure due to decoherence. In this work, we train deep neural models to produce optimal schedules that are conditioned on the problem at hand. We consider two types of problem representation: the Hamiltonian form, and the Quadratic Unconstrained Binary Optimization (QUBO) form. A novel loss function that scores schedules according to their approximated success probability is introduced. 
We benchmark our approach on random QUBO problems, Grover search, 3-SAT, and MAX-CUT problems and show that our approach outperforms, by a sizable margin, the linear schedules as well as alternative approaches that were very recently proposed.", + "title": "Fidelity-based Deep Adiabatic Scheduling", + "authors": [ + "Eli Ovits", + "Lior Wolf" + ], + "emails": [ + "gmail.com", + "~Lior_Wolf1" + ], + "rank": 67 + }, + { + "url": "https://openreview.net/forum?id=3UDSdyIcBDA", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 8, + 6 + ], + "rating": "7.33", + "confidences": [ + 3, + 3, + 3 + ], + "abstract": "Despite the existence of divergence examples, RMSprop remains \none of the most popular algorithms in machine learning. Towards closing the gap between theory and practice, we prove that RMSprop converges with proper choice of hyper-parameters under certain conditions. More specifically, we prove that when the hyper-parameter \u03b22 is close enough to 1, RMSprop and its random shuffling version converge to a bounded region in general, and to critical points in the interpolation regime. It is worth mentioning that our results do not depend on ``bounded gradient\" assumption, which is often the key assumption utilized by existing theoretical work for Adam-type adaptive gradient method. Removing this assumption allows us to establish a phase transition from divergence to non-divergence for RMSprop. \n\nFinally, based on our theory, we conjecture that in practice there is a critical threshold \u03b22\u2217, such that RMSprop generates reasonably good results only if 1>\u03b22\u2265\u03b22\u2217. 
We provide empirical evidence for such a phase transition in our numerical experiments.", + "title": "RMSprop converges with proper hyper-parameter", + "authors": [ + "Naichen Shi", + "Dawei Li", + "Mingyi Hong", + "Ruoyu Sun" + ], + "emails": [ + "~Naichen_Shi1", + "~Ruoyu_Sun1", + "~Mingyi_Hong1", + "~Dawei_Li3" + ], + "rank": 68 + }, + { + "url": "https://openreview.net/forum?id=9YlaeLfuhJF", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 7, + 7, + 7 + ], + "rating": "7.33", + "confidences": [ + 4, + 3, + 2, + 3 + ], + "abstract": "Classifiers in machine learning are often brittle when deployed. Particularly concerning are models with inconsistent performance on specific subgroups of a class, e.g., exhibiting disparities in skin cancer classification in the presence or absence of a spurious bandage. To mitigate these performance differences, we introduce model patching, a two-stage framework for improving robustness that encourages the model to be invariant to subgroup differences, and focus on class information shared by subgroups. Model patching first models subgroup features within a class and learns semantic transformations between them, and then trains a classifier with data augmentations that deliberately manipulate subgroup features. We instantiate model patching with CAMEL, which (1) uses a CycleGAN to learn the intra-class, inter-subgroup augmentations, and (2) balances subgroup performance using a theoretically-motivated subgroup consistency regularizer, accompanied by a new robust objective. We demonstrate CAMEL\u2019s effectiveness on 3 benchmark datasets, with reductions in robust error of up to 33% relative to the best baseline. 
Lastly, CAMEL successfully patches a model that fails due to spurious features on a real-world skin cancer dataset.", + "title": "Model Patching: Closing the Subgroup Performance Gap with Data Augmentation", + "authors": [ + "Karan Goel", + "Albert Gu", + "Yixuan Li", + "Christopher Re" + ], + "emails": [ + "~Albert_Gu1", + "~Karan_Goel1", + "~Yixuan_Li1", + "~Christopher_Re1" + ], + "rank": 69 + }, + { + "url": "https://openreview.net/forum?id=jWkw45-9AbL", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 8, + 7 + ], + "rating": "7.33", + "confidences": [ + 3, + 3, + 3 + ], + "abstract": "We propose a Distributional Approach for addressing Controlled Text Generation from pre-trained Language Models (LM). This approach permits to specify, in a single formal framework, both \u201cpointwise\u2019\u201d and \u201cdistributional\u201d constraints over the target LM \u2014 to our knowledge, the first model with such generality \u2014while minimizing KL divergence from the initial LM distribution. The optimal target distribution is then uniquely determined as an explicit EBM (Energy-BasedModel) representation. From that optimal representation, we then train a target controlled Autoregressive LM through an adaptive distributional variant of PolicyGradient. We conduct a first set of experiments over pointwise constraints showing the advantages of our approach over a set of baselines, in terms of obtaining a controlled LM balancing constraint satisfaction with divergence from the pretrained LM. We then perform experiments over distributional constraints, a unique feature of our approach, demonstrating its potential as a remedy to the problem of Bias in Language Models. 
Through an ablation study, we show the effectiveness of our adaptive technique for obtaining faster convergence.\nCode available at https://github.com/naver/gdc", + "title": "A Distributional Approach to Controlled Text Generation", + "authors": [ + "Muhammad Khalifa", + "Hady Elsahar", + "Marc Dymetman" + ], + "emails": [ + "~Hady_Elsahar2", + "~Muhammad_Khalifa2", + "~Marc_Dymetman1" + ], + "rank": 70 + }, + { + "url": "https://openreview.net/forum?id=ajOrOhQOsYx", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 8, + 8, + 6 + ], + "rating": "7.33", + "confidences": [ + 1, + 4, + 4, + 3 + ], + "abstract": "Group equivariant convolutional networks (GCNNs) endow classical convolutional networks with additional symmetry priors, which can lead to a considerably improved performance. Recent advances in the theoretical description of GCNNs revealed that such models can generally be understood as performing convolutions with G-steerable kernels, that is, kernels that satisfy an equivariance constraint themselves. While the G-steerability constraint has been derived, it has to date only been solved for specific use cases - a general characterization of G-steerable kernel spaces is still missing. This work provides such a characterization for the practically relevant case of G being any compact group. Our investigation is motivated by a striking analogy between the constraints underlying steerable kernels on the one hand and spherical tensor operators from quantum mechanics on the other hand. 
By generalizing the famous Wigner-Eckart theorem for spherical tensor operators, we prove that steerable kernel spaces are fully understood and parameterized in terms of 1) generalized reduced matrix elements, 2) Clebsch-Gordan coefficients, and 3) harmonic basis functions on homogeneous spaces.", + "title": "A Wigner-Eckart Theorem for Group Equivariant Convolution Kernels", + "authors": [ + "Leon Lang", + "Maurice Weiler" + ], + "emails": [ + "~Maurice_Weiler1", + "~Leon_Lang1" + ], + "rank": 71 + }, + { + "url": "https://openreview.net/forum?id=1FvkSpWosOl", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 8, + 7, + 6 + ], + "rating": "7.33", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "As an essential ingredient of modern deep learning, attention mechanism, especially self-attention, plays a vital role in the global correlation discovery. However, is hand-crafted attention irreplaceable when modeling the global context? Our intriguing finding is that self-attention is not better than the matrix decomposition~(MD) model developed 20 years ago regarding the performance and computational cost for encoding the long-distance dependencies. We model the global context issue as a low-rank completion problem and show that its optimization algorithms can help design global information blocks. This paper then proposes a series of Hamburgers, in which we employ the optimization algorithms for solving MDs to factorize the input representations into sub-matrices and reconstruct a low-rank embedding. Hamburgers with different MDs can perform favorably against the popular global context module self-attention when carefully coping with gradients back-propagated through MDs. Comprehensive experiments are conducted in the vision tasks where it is crucial to learn the global context, including semantic segmentation and image generation, demonstrating significant improvements over self-attention and its variants. 
Code is available at https://github.com/Gsunshine/Enjoy-Hamburger.", + "title": "Is Attention Better Than Matrix Decomposition?", + "authors": [ + "Zhengyang Geng", + "Meng-Hao Guo", + "Hongxu Chen", + "Xia Li", + "Ke Wei", + "Zhouchen Lin" + ], + "emails": [ + "~Zhengyang_Geng1", + "~Hongxu_Chen2", + "~Xia_Li3", + "~Meng-Hao_Guo1", + "~Zhouchen_Lin1", + "~Ke_Wei1" + ], + "rank": 72 + }, + { + "url": "https://openreview.net/forum?id=v9c7hr9ADKx", + "decision": "Accept (Spotlight)", + "ratings": [ + 6, + 9, + 7 + ], + "rating": "7.33", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Recent advances in multi-agent reinforcement learning have been largely limited in training one model from scratch for every new task. The limitation is due to the restricted model architecture related to fixed input and output dimensions. This hinders the experience accumulation and transfer of the learned agent over tasks with diverse levels of difficulty (e.g. 3 vs 3 or 5 vs 6 multi-agent games). In this paper, we make the first attempt to explore a universal multi-agent reinforcement learning pipeline, designing one single architecture to fit tasks with the requirement of different observation and action configurations. Unlike previous RNN-based models, we utilize a transformer-based model to generate a flexible policy by decoupling the policy distribution from the intertwined input observation with an importance weight measured by the merits of the self-attention mechanism. Compared to a standard transformer block, the proposed model, named as Universal Policy Decoupling Transformer (UPDeT), further relaxes the action restriction and makes the multi-agent task's decision process more explainable. UPDeT is general enough to be plugged into any multi-agent reinforcement learning pipeline and equip them with strong generalization abilities that enables the handling of multiple tasks at a time. 
Extensive experiments on large-scale SMAC multi-agent competitive games demonstrate that the proposed UPDeT-based multi-agent reinforcement learning achieves significant results relative to state-of-the-art approaches, demonstrating advantageous transfer capability in terms of both performance and training speed (10 times faster).", + "title": "UPDeT: Universal Multi-agent RL via Policy Decoupling with Transformers", + "authors": [ + "Siyi Hu", + "Fengda Zhu", + "Xiaojun Chang", + "Xiaodan Liang" + ], + "emails": [ + "~Xiaojun_Chang3", + "~Fengda_Zhu1", + "~Siyi_Hu1", + "~Xiaodan_Liang2" + ], + "rank": 73 + }, + { + "url": "https://openreview.net/forum?id=O3Y56aqpChA", + "decision": "Accept (Oral)", + "ratings": [ + 8, + 8, + 6, + 7 + ], + "rating": "7.33", + "confidences": [ + 5, + 5, + 4, + 4 + ], + "abstract": "Most few-shot learning techniques are pre-trained on a large, labeled \u201cbase dataset\u201d. In problem domains where such large labeled datasets are not available for pre-training (e.g., X-ray, satellite images), one must resort to pre-training in a different \u201csource\u201d problem domain (e.g., ImageNet), which can be very different from the desired target task. Traditional few-shot and transfer learning techniques fail in the presence of such extreme differences between the source and target tasks. In this paper, we present a simple and effective solution to tackle this extreme domain gap: self-training a source domain representation on unlabeled data from the target domain. 
We show that this improves one-shot performance on the target domain by 2.9 points on average on the challenging BSCD-FSL benchmark consisting of datasets from multiple domains.", + "title": "Self-training For Few-shot Transfer Across Extreme Task Differences", + "authors": [ + "Cheng Perng Phoo", + "Bharath Hariharan" + ], + "emails": [ + "~Cheng_Perng_Phoo1", + "~Bharath_Hariharan3" + ], + "rank": 74 + }, + { + "url": "https://openreview.net/forum?id=6puCSjH3hwA", + "decision": "Accept (Spotlight)", + "ratings": [ + 6, + 8, + 8, + 6 + ], + "rating": "7.33", + "confidences": [ + 2, + 5, + 5, + 3 + ], + "abstract": "Image and video synthesis are closely related areas aiming at generating content from noise. While rapid progress has been demonstrated in improving image-based models to handle large resolutions, high-quality renderings, and wide variations in image content, achieving comparable video generation results remains problematic. We present a framework that leverages contemporary image generators to render high-resolution videos. We frame the video synthesis problem as discovering a trajectory in the latent space of a pre-trained and fixed image generator. Not only does such a framework render high-resolution videos, but it also is an order of magnitude more computationally efficient. We introduce a motion generator that discovers the desired trajectory, in which content and motion are disentangled. With such a representation, our framework allows for a broad range of applications, including content and motion manipulation. Furthermore, we introduce a new task, which we call cross-domain video synthesis, in which the image and motion generators are trained on disjoint datasets belonging to different domains. This allows for generating moving objects for which the desired video data is not available. Extensive experiments on various datasets demonstrate the advantages of our methods over existing video generation techniques. 
Code will be released at https://github.com/snap-research/MoCoGAN-HD.", + "title": "A Good Image Generator Is What You Need for High-Resolution Video Synthesis", + "authors": [ + "Yu Tian", + "Jian Ren", + "Menglei Chai", + "Kyle Olszewski", + "Xi Peng", + "Dimitris N. Metaxas", + "Sergey Tulyakov" + ], + "emails": [ + "~Jian_Ren2", + "~Kyle_Olszewski1", + "~Menglei_Chai1", + "~Yu_Tian2", + "~Dimitris_N._Metaxas1", + "~Sergey_Tulyakov1", + "~Xi_Peng1" + ], + "rank": 75 + }, + { + "url": "https://openreview.net/forum?id=j9Rv7qdXjd", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 7, + 9 + ], + "rating": "7.31", + "confidences": [ + 2, + 3, + 4, + 4 + ], + "abstract": "Current neural architecture search (NAS) strategies focus only on finding a single, good, architecture. They offer little insight into why a specific network is performing well, or how we should modify the architecture if we want further improvements. We propose a Bayesian optimisation (BO) approach for NAS that combines the Weisfeiler-Lehman graph kernel with a Gaussian process surrogate. Our method not only optimises the architecture in a highly data-efficient manner, but also affords interpretability by discovering useful network features and their corresponding impact on the network performance. Moreover, our method is capable of capturing the topological structures of the architectures and is scalable to large graphs, thus making the high-dimensional and graph-like search spaces amenable to BO. We demonstrate empirically that our surrogate model is capable of identifying useful motifs which can guide the generation of new architectures. 
We finally show that our method outperforms existing NAS approaches to achieve the state of the art on both closed- and open-domain search spaces.", + "title": "Interpretable Neural Architecture Search via Bayesian Optimisation with Weisfeiler-Lehman Kernels", + "authors": [ + "Binxin Ru", + "Xingchen Wan", + "Xiaowen Dong", + "Michael Osborne" + ], + "emails": [ + "~Michael_Osborne1", + "~Binxin_Ru1", + "~Xiaowen_Dong1", + "~Xingchen_Wan1" + ], + "rank": 76 + }, + { + "url": "https://openreview.net/forum?id=OthEq8I5v1", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 7, + 8 + ], + "rating": "7.31", + "confidences": [ + 3, + 3, + 5, + 5 + ], + "abstract": "Reinforcement learning has been shown to be highly successful at many challenging tasks. However, success heavily relies on well-shaped rewards. Intrinsically motivated RL attempts to remove this constraint by defining an intrinsic reward function. Motivated by the self-consciousness concept in psychology, we make a natural assumption that the agent knows what constitutes itself, and propose a new intrinsic objective that encourages the agent to have maximum control on the environment. We mathematically formalize this reward as the mutual information between the agent state and the surrounding state under the current agent policy. With this new intrinsic motivation, we are able to outperform previous methods, including being able to complete the pick-and-place task for the first time without using any task reward. 
A video showing experimental results is available at https://youtu.be/AUCwc9RThpk.", + "title": "Mutual Information State Intrinsic Control", + "authors": [ + "Rui Zhao", + "Yang Gao", + "Pieter Abbeel", + "Volker Tresp", + "Wei Xu" + ], + "emails": [ + "~Wei_Xu13", + "~Pieter_Abbeel2", + "~Volker_Tresp1", + "~Rui_Zhao1", + "~Yang_Gao1" + ], + "rank": 77 + }, + { + "url": "https://openreview.net/forum?id=MJIve1zgR_", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 9, + 7, + 7 + ], + "rating": "7.31", + "confidences": [ + 3, + 4, + 4, + 5 + ], + "abstract": "Semi-supervised learning, i.e., training networks with both labeled and unlabeled data, has made significant progress recently. However, existing works have primarily focused on image classification tasks and neglected object detection which requires more annotation effort. In this work, we revisit the Semi-Supervised Object Detection (SS-OD) and identify the pseudo-labeling bias issue in SS-OD. To address this, we introduce Unbiased Teacher, a simple yet effective approach that jointly trains a student and a gradually progressing teacher in a mutually-beneficial manner. Together with a class-balance loss to downweight overly confident pseudo-labels, Unbiased Teacher consistently improved state-of-the-art methods by significant margins on COCO-standard, COCO-additional, and VOC datasets. 
Specifically, Unbiased Teacher achieves 6.8 absolute mAP improvements against state-of-the-art method when using 1% of labeled data on MS-COCO, achieves around 10 mAP improvements against the supervised baseline when using only 0.5, 1, 2% of labeled data on MS-COCO.", + "title": "Unbiased Teacher for Semi-Supervised Object Detection", + "authors": [ + "Yen-Cheng Liu", + "Chih-Yao Ma", + "Zijian He", + "Chia-Wen Kuo", + "Kan Chen", + "Peizhao Zhang", + "Bichen Wu", + "Zsolt Kira", + "Peter Vajda" + ], + "emails": [ + "~Chia-Wen_Kuo1", + "~Chih-Yao_Ma1", + "~Peter_Vajda1", + "~Peizhao_Zhang1", + "~Yen-Cheng_Liu1", + "~Kan_Chen1", + "~Zsolt_Kira1", + "fb.com", + "~Bichen_Wu1" + ], + "rank": 78 + }, + { + "url": "https://openreview.net/forum?id=Mk6PZtgAgfq", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 7, + 8 + ], + "rating": "7.30", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "Gradient estimation in models with discrete latent variables is a challenging problem, because the simplest unbiased estimators tend to have high variance. To counteract this, modern estimators either introduce bias, rely on multiple function evaluations, or use learned, input-dependent baselines. Thus, there is a need for estimators that require minimal tuning, are computationally cheap, and have low mean squared error. In this paper, we show that the variance of the straight-through variant of the popular Gumbel-Softmax estimator can be reduced through Rao-Blackwellization without increasing the number of function evaluations. This provably reduces the mean squared error. We empirically demonstrate that this leads to variance reduction, faster convergence, and generally improved performance in two unsupervised latent variable models.", + "title": "Rao-Blackwellizing the Straight-Through Gumbel-Softmax Gradient Estimator", + "authors": [ + "Max B Paulus", + "Chris J. 
Maddison", + "Andreas Krause" + ], + "emails": [ + "~Andreas_Krause1", + "~Max_B_Paulus1", + "~Chris_J._Maddison1" + ], + "rank": 79 + }, + { + "url": "https://openreview.net/forum?id=D9I3drBz4UC", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 7, + 7, + 7 + ], + "rating": "7.29", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Natural data are often long-tail distributed over semantic classes. Existing recognition methods tend to focus on gaining performance on tail classes, often at the expense of losing performance on head classes and with increased classifier variance. The low tail performance manifests itself in large inter-class confusion and high classifier variance. We aim to reduce both the bias and the variance of a long-tailed classifier by RoutIng Diverse Experts (RIDE), consisting of three components: 1) a shared architecture for multiple classifiers (experts); 2) a distribution-aware diversity loss that encourages more diverse decisions for classes with fewer training instances; and 3) an expert routing module that dynamically assigns more ambiguous instances to additional experts. With on-par computational complexity, RIDE significantly outperforms the state-of-the-art methods by 5% to 7% on all the benchmarks including CIFAR100-LT, ImageNet-LT, and iNaturalist 2018. RIDE is also a universal framework that can be applied to different backbone networks and integrated into various long-tailed algorithms and training mechanisms for consistent performance gains. 
Our code is publicly available at https://github.com/frank-xwang/RIDE-LongTailRecognition.", + "title": "Long-tailed Recognition by Routing Diverse Distribution-Aware Experts", + "authors": [ + "Xudong Wang", + "Long Lian", + "Zhongqi Miao", + "Ziwei Liu", + "Stella Yu" + ], + "emails": [ + "~Ziwei_Liu1", + "~Long_Lian1", + "~Stella_Yu2", + "~Zhongqi_Miao1", + "~Xudong_Wang4" + ], + "rank": 80 + }, + { + "url": "https://openreview.net/forum?id=OmtmcPkkhT", + "decision": "Accept (Poster)", + "ratings": [ + 9, + 8, + 6, + 6 + ], + "rating": "7.29", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Although deep networks are typically used to approximate functions over high dimensional inputs, recent work has increased interest in neural networks as function approximators for low-dimensional-but-complex functions, such as representing images as a function of pixel coordinates, solving differential equations, or representing signed distance fields or neural radiance fields. Key to these recent successes has been the use of new elements such as sinusoidal nonlinearities, or Fourier features in positional encodings, which vastly outperform simple ReLU networks. In this paper, we propose and empirically demonstrate that an arguably simpler class of function approximators can work just as well for such problems: multiplicative filter networks. In these networks, we avoid traditional compositional depth altogether, and simply multiply together (linear functions of) sinusoidal or Gabor wavelet functions applied to the input. This representation has the notable advantage that the entire function can simply be viewed as a linear function approximator over an exponential number of Fourier or Gabor basis functions, respectively. 
Despite this simplicity, when compared to recent approaches that use Fourier features with ReLU networks or sinusoidal activation networks, we show that these multiplicative filter networks largely outperform or match the performance of these recent approaches on the domains highlighted in these past works.", + "title": "Multiplicative Filter Networks", + "authors": [ + "Rizal Fathony", + "Anit Kumar Sahu", + "Devin Willmott", + "J Zico Kolter" + ], + "emails": [ + "~Anit_Kumar_Sahu1", + "~Rizal_Fathony1", + "~J_Zico_Kolter1", + "~Devin_Willmott1" + ], + "rank": 81 + }, + { + "url": "https://openreview.net/forum?id=1YLJDvSx6J4", + "decision": "Accept (Spotlight)", + "ratings": [ + 6, + 6, + 10, + 7 + ], + "rating": "7.27", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Learning on 3D structures of large biomolecules is emerging as a distinct area in machine learning, but there has yet to emerge a unifying network architecture that simultaneously leverages the geometric and relational aspects of the problem domain. To address this gap, we introduce geometric vector perceptrons, which extend standard dense layers to operate on collections of Euclidean vectors. Graph neural networks equipped with such layers are able to perform both geometric and relational reasoning on efficient representations of macromolecules. We demonstrate our approach on two important problems in learning from protein structure: model quality assessment and computational protein design. Our approach improves over existing classes of architectures on both problems, including state-of-the-art convolutional neural networks and graph neural networks. 
We release our code at https://github.com/drorlab/gvp.", + "title": "Learning from Protein Structure with Geometric Vector Perceptrons", + "authors": [ + "Bowen Jing", + "Stephan Eismann", + "Patricia Suriana", + "Raphael John Lamarre Townshend", + "Ron Dror" + ], + "emails": [ + "stanford.edu", + "~Stephan_Eismann1", + "~Bowen_Jing1", + "~Ron_Dror1", + "~Raphael_John_Lamarre_Townshend1" + ], + "rank": 82 + }, + { + "url": "https://openreview.net/forum?id=wWK7yXkULyh", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 7, + 7, + 8 + ], + "rating": "7.27", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Recent advances by practitioners in the deep learning community have breathed new life into Locality Sensitive Hashing (LSH), using it to reduce memory and time bottlenecks in neural network (NN) training. However, while LSH has sub-linear guarantees for approximate near-neighbor search in theory, it is known to have inefficient query time in practice due to its use of random hash functions. Moreover, when model parameters are changing, LSH suffers from update overhead. This work is motivated by an observation that model parameters evolve slowly, such that the changes do not always require an LSH update to maintain performance. This phenomenon points to the potential for a reduction in update time and allows for a modified learnable version of data-dependent LSH to improve query time at a low cost. We use the above insights to build MONGOOSE, an end-to-end LSH framework for efficient NN training. In particular, MONGOOSE is equipped with a scheduling algorithm to adaptively perform LSH updates with provable guarantees and learnable hash functions to improve query efficiency. Empirically, we validate MONGOOSE on large-scale deep learning models for recommendation systems and language modeling. 
We find that it achieves up to 8% better accuracy compared to previous LSH approaches, with 6.5\u00d7 speed-up and 6\u00d7 reduction in memory usage.", + "title": "MONGOOSE: A Learnable LSH Framework for Efficient Neural Network Training", + "authors": [ + "Beidi Chen", + "Zichang Liu", + "Binghui Peng", + "Zhaozhuo Xu", + "Jonathan Lingjie Li", + "Tri Dao", + "Zhao Song", + "Anshumali Shrivastava", + "Christopher Re" + ], + "emails": [ + "~Binghui_Peng1", + "~Beidi_Chen1", + "~Zhaozhuo_Xu1", + "~Jonathan_Lingjie_Li1", + "~Tri_Dao1", + "~Christopher_Re1", + "~Anshumali_Shrivastava1", + "~Zhao_Song3", + "~Zichang_Liu1" + ], + "rank": 83 + }, + { + "url": "https://openreview.net/forum?id=wb3wxCObbRT", + "decision": "Accept (Oral)", + "ratings": [ + 8, + 7, + 7, + 7 + ], + "rating": "7.27", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "We develop an approach to growing deep network architectures over the course of training, driven by a principled combination of accuracy and sparsity objectives. Unlike existing pruning or architecture search techniques that operate on full-sized models or supernet architectures, our method can start from a small, simple seed architecture and dynamically grow and prune both layers and filters. By combining a continuous relaxation of discrete network structure optimization with a scheme for sampling sparse subnetworks, we produce compact, pruned networks, while also drastically reducing the computational expense of training. For example, we achieve 49.7% inference FLOPs and 47.4% training FLOPs savings compared to a baseline ResNet-50 on ImageNet, while maintaining 75.2% top-1 validation accuracy --- all without any dedicated fine-tuning stage. 
Experiments across CIFAR, ImageNet, PASCAL VOC, and Penn Treebank, with convolutional networks for image classification and semantic segmentation, and recurrent networks for language modeling, demonstrate that we both train faster and produce more efficient networks than competing architecture pruning or search methods.", + "title": "Growing Efficient Deep Networks by Structured Continuous Sparsification", + "authors": [ + "Xin Yuan", + "Pedro Henrique Pamplona Savarese", + "Michael Maire" + ], + "emails": [ + "~Michael_Maire1", + "~Pedro_Henrique_Pamplona_Savarese1", + "~Xin_Yuan5" + ], + "rank": 84 + }, + { + "url": "https://openreview.net/forum?id=iAmZUo0DxC0", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 8, + 7 + ], + "rating": "7.27", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "The volume of \"free\" data on the internet has been key to the current success of deep learning. However, it also raises privacy concerns about the unauthorized exploitation of personal data for training commercial models. It is thus crucial to develop methods to prevent unauthorized data exploitation. This paper raises the question: can data be made unlearnable for deep learning models? We present a type of error-minimizing noise that can indeed make training examples unlearnable. Error-minimizing noise is intentionally generated to reduce the error of one or more of the training example(s) close to zero, which can trick the model into believing there is \"nothing\" to learn from these example(s). The noise is restricted to be imperceptible to human eyes, and thus does not affect normal data utility. We empirically verify the effectiveness of error-minimizing noise in both sample-wise and class-wise forms. We also demonstrate its flexibility under extensive experimental settings and practicability in a case study of face recognition. 
Our work establishes an important \ufb01rst step towards making personal data unexploitable to deep learning models.", + "title": "Unlearnable Examples: Making Personal Data Unexploitable", + "authors": [ + "Hanxun Huang", + "Xingjun Ma", + "Sarah Monazam Erfani", + "James Bailey", + "Yisen Wang" + ], + "emails": [ + "~Hanxun_Huang1", + "~James_Bailey1", + "~Xingjun_Ma1", + "~Sarah_Monazam_Erfani1", + "~Yisen_Wang1" + ], + "rank": 85 + }, + { + "url": "https://openreview.net/forum?id=yUxUNaj2Sl", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 9, + 6 + ], + "rating": "7.27", + "confidences": [ + 2, + 4, + 5, + 4 + ], + "abstract": "Convolutional neural networks (CNNs) learn to extract representations of complex features, such as object shapes and textures to solve image recognition tasks. Recent work indicates that CNNs trained on ImageNet are biased towards features that encode textures and that these alone are sufficient to generalize to unseen test data from the same distribution as the training data but often fail to generalize to out-of-distribution data. It has been shown that augmenting the training data with different image styles decreases this texture bias in favor of increased shape bias while at the same time improving robustness to common corruptions, such as noise and blur. Commonly, this is interpreted as shape bias increasing corruption robustness. However, this relationship is only hypothesized. We perform a systematic study of different ways of composing inputs based on natural images, explicit edge information, and stylization. While stylization is essential for achieving high corruption robustness, we do not find a clear correlation between shape bias and robustness. 
We conclude that the data augmentation caused by style-variation accounts for the improved corruption robustness and increased shape bias is only a byproduct.", + "title": "Does enhanced shape bias improve neural network robustness to common corruptions?", + "authors": [ + "Chaithanya Kumar Mummadi", + "Ranjitha Subramaniam", + "Robin Hutmacher", + "Julien Vitay", + "Volker Fischer", + "Jan Hendrik Metzen" + ], + "emails": [ + "~Jan_Hendrik_Metzen1", + "~Chaithanya_Kumar_Mummadi1", + "~Robin_Hutmacher1", + "~Ranjitha_Subramaniam1", + "~Volker_Fischer1", + "~Julien_Vitay1" + ], + "rank": 86 + }, + { + "url": "https://openreview.net/forum?id=Pbj8H_jEHYv", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 7, + 8 + ], + "rating": "7.27", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Recent work has highlighted several advantages of enforcing orthogonality in the weight layers of deep networks, such as maintaining the stability of activations, preserving gradient norms, and enhancing adversarial robustness by enforcing low Lipschitz constants. Although numerous methods exist for enforcing the orthogonality of fully-connected layers, those for convolutional layers are more heuristic in nature, often focusing on penalty methods or limited classes of convolutions. In this work, we propose and evaluate an alternative approach to directly parameterize convolutional layers that are constrained to be orthogonal. Specifically, we propose to apply the Cayley transform to a skew-symmetric convolution in the Fourier domain, so that the inverse convolution needed by the Cayley transform can be computed efficiently. We compare our method to previous Lipschitz-constrained and orthogonal convolutional layers and show that it indeed preserves orthogonality to a high degree even for large convolutions. 
Applied to the problem of certified adversarial robustness, we show that networks incorporating the layer outperform existing deterministic methods for certified defense against \u21132-norm-bounded adversaries, while scaling to larger architectures than previously investigated. Code is available at https://github.com/locuslab/orthogonal-convolutions.", + "title": "Orthogonalizing Convolutional Layers with the Cayley Transform", + "authors": [ + "Asher Trockman", + "J Zico Kolter" + ], + "emails": [ + "~Asher_Trockman1", + "~J_Zico_Kolter1" + ], + "rank": 87 + }, + { + "url": "https://openreview.net/forum?id=B7v4QMR6Z9w", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 7, + 7, + 8 + ], + "rating": "7.27", + "confidences": [ + 3, + 3, + 5, + 4 + ], + "abstract": "We propose a novel federated learning method for distributively training neural network models, where the server orchestrates cooperation between a subset of randomly chosen devices in each round. We view Federated Learning problem primarily from a communication perspective and allow more device level computations to save transmission costs. We point out a fundamental dilemma, in that the minima of the local-device level empirical loss are inconsistent with those of the global empirical loss. Different from recent prior works, that either attempt inexact minimization or utilize devices for parallelizing gradient computation, we propose a dynamic regularizer for each device at each round, so that in the limit the global and device solutions are aligned. 
We demonstrate both through empirical results on real and synthetic data as well as analytical results that our scheme leads to efficient training, in both convex and non-convex settings, while being fully agnostic to device heterogeneity and robust to large number of devices, partial participation and unbalanced data.", + "title": "Federated Learning Based on Dynamic Regularization", + "authors": [ + "Durmus Alp Emre Acar", + "Yue Zhao", + "Ramon Matas", + "Matthew Mattina", + "Paul Whatmough", + "Venkatesh Saligrama" + ], + "emails": [ + "~Venkatesh_Saligrama1", + "~Matthew_Mattina1", + "~Yue_Zhao12", + "~Durmus_Alp_Emre_Acar1", + "~Paul_Whatmough1", + "~Ramon_Matas1" + ], + "rank": 88 + }, + { + "url": "https://openreview.net/forum?id=O-XJwyoIF-k", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 7, + 8 + ], + "rating": "7.27", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "The universal approximation property of width-bounded networks has been studied as a dual of classical universal approximation results on depth-bounded networks. However, the critical width enabling the universal approximation has not been exactly characterized in terms of the input dimension dx and the output dimension dy. In this work, we provide the first definitive result in this direction for networks using the ReLU activation functions: The minimum width required for the universal approximation of the Lp functions is exactly max{dx+1,dy}. We also prove that the same conclusion does not hold for the uniform approximation with ReLU, but does hold with an additional threshold activation function. 
Our proof technique can be also used to derive a tighter upper bound on the minimum width required for the universal approximation using networks with general activation functions.", + "title": "Minimum Width for Universal Approximation", + "authors": [ + "Sejun Park", + "Chulhee Yun", + "Jaeho Lee", + "Jinwoo Shin" + ], + "emails": [ + "~Sejun_Park1", + "~Jaeho_Lee3", + "~Chulhee_Yun1", + "~Jinwoo_Shin1" + ], + "rank": 89 + }, + { + "url": "https://openreview.net/forum?id=uCY5MuAxcxU", + "decision": "Accept (Oral)", + "ratings": [ + 8, + 7, + 7, + 7 + ], + "rating": "7.27", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Convolutional neural networks often dominate fully-connected counterparts in generalization performance, especially on image classification tasks. This is often explained in terms of \\textquotedblleft better inductive bias.\\textquotedblright\\ However, this has not been made mathematically rigorous, and the hurdle is that the sufficiently wide fully-connected net can always simulate the convolutional net. Thus the training algorithm plays a role. The current work describes a natural task on which a provable sample complexity gap can be shown, for standard training algorithms. We construct a single natural distribution on Rd\u00d7{\u00b11} on which any orthogonal-invariant algorithm (i.e. fully-connected networks trained with most gradient-based methods from gaussian initialization) requires \u03a9(d2) samples to generalize while O(1) samples suffice for convolutional architectures. Furthermore, we demonstrate a single target function, learning which on all possible distributions leads to an O(1) vs \u03a9(d2/\u03b5) gap. The proof relies on the fact that SGD on fully-connected network is orthogonal equivariant. Similar results are achieved for \u21132 regression and adaptive training algorithms, e.g. 
Adam and AdaGrad, which are only permutation equivariant.", + "title": "Why Are Convolutional Nets More Sample-Efficient than Fully-Connected Nets?", + "authors": [ + "Zhiyuan Li", + "Yi Zhang", + "Sanjeev Arora" + ], + "emails": [ + "~Sanjeev_Arora1", + "~Zhiyuan_Li2", + "~Yi_Zhang1" + ], + "rank": 90 + }, + { + "url": "https://openreview.net/forum?id=37nvvqkCo5", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 8, + 7, + 6 + ], + "rating": "7.27", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Real-world classification problems typically exhibit an imbalanced or long-tailed label distribution, wherein many labels have only a few associated samples. This poses a challenge for generalisation on such labels, and also makes naive learning biased towards dominant labels. In this paper, we present a statistical framework that unifies and generalises several recent proposals to cope with these challenges. Our framework revisits the classic idea of logit adjustment based on the label frequencies, which encourages a large relative margin between logits of rare positive versus dominant negative labels. This yields two techniques for long-tail learning, where such adjustment is either applied post-hoc to a trained model, or enforced in the loss during training. These techniques are statistically grounded, and practically effective on four real-world datasets with long-tailed label distributions. 
", + "title": "Long-tail learning via logit adjustment", + "authors": [ + "Aditya Krishna Menon", + "Sadeep Jayasumana", + "Ankit Singh Rawat", + "Himanshu Jain", + "Andreas Veit", + "Sanjiv Kumar" + ], + "emails": [ + "~Ankit_Singh_Rawat1", + "~Andreas_Veit1", + "google.com", + "~Sadeep_Jayasumana1", + "~Sanjiv_Kumar1", + "~Aditya_Krishna_Menon1" + ], + "rank": 91 + }, + { + "url": "https://openreview.net/forum?id=giit4HdDNa", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 8, + 7 + ], + "rating": "7.27", + "confidences": [ + 2, + 5, + 4, + 4 + ], + "abstract": "Despite their elegant formulation and lightweight memory cost, neural ordinary differential equations (NODEs) suffer from known representational limitations. In particular, the single flow learned by NODEs cannot express all homeomorphisms from a given data space to itself, and their static weight parameterization restricts the type of functions they can learn compared to discrete architectures with layer-dependent weights. Here, we describe a new module called neurally-controlled ODE (N-CODE) designed to improve the expressivity of NODEs. The parameters of N-CODE modules are dynamic variables governed by a trainable map from initial or current activation state, resulting in forms of open-loop and closed-loop control, respectively. A single module is sufficient for learning a distribution on non-autonomous flows that adaptively drive neural representations. We provide theoretical and empirical evidence that N-CODE circumvents limitations of previous NODEs models and show how increased model expressivity manifests in several supervised and unsupervised learning problems. 
These favorable empirical results indicate the potential of using data- and activity-dependent plasticity in neural networks across numerous domains.", + "title": "Go with the flow: Adaptive control for Neural ODEs", + "authors": [ + "Mathieu Chalvidal", + "Matthew Ricci", + "Rufin VanRullen", + "Thomas Serre" + ], + "emails": [ + "~Thomas_Serre1", + "~Matthew_Ricci1", + "cnrs.fr", + "~Mathieu_Chalvidal1" + ], + "rank": 92 + }, + { + "url": "https://openreview.net/forum?id=cPZOyoDloxl", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 8, + 7, + 7 + ], + "rating": "7.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Every living organism struggles against disruptive environmental forces to carve out and maintain an orderly niche. We propose that such a struggle to achieve and preserve order might offer a principle for the emergence of useful behaviors in artificial agents. We formalize this idea into an unsupervised reinforcement learning method called surprise minimizing reinforcement learning (SMiRL). SMiRL alternates between learning a density model to evaluate the surprise of a stimulus, and improving the policy to seek more predictable stimuli. The policy seeks out stable and repeatable situations that counteract the environment's prevailing sources of entropy. This might include avoiding other hostile agents, or finding a stable, balanced pose for a bipedal robot in the face of disturbance forces. We demonstrate that our surprise minimizing agents can successfully play Tetris, Doom, control a humanoid to avoid falls, and navigate to escape enemies in a maze without any task-specific reward supervision. 
We further show that SMiRL can be used together with standard task rewards to accelerate reward-driven learning.", + "title": "SMiRL: Surprise Minimizing Reinforcement Learning in Unstable Environments", + "authors": [ + "Glen Berseth", + "Daniel Geng", + "Coline Manon Devin", + "Nicholas Rhinehart", + "Chelsea Finn", + "Dinesh Jayaraman", + "Sergey Levine" + ], + "emails": [ + "~Coline_Manon_Devin1", + "~Chelsea_Finn1", + "~Glen_Berseth1", + "~Dinesh_Jayaraman2", + "~Nicholas_Rhinehart1", + "berkeley.edu", + "~Sergey_Levine1" + ], + "rank": 93 + }, + { + "url": "https://openreview.net/forum?id=uXl3bZLkr3c", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 8 + ], + "rating": "7.25", + "confidences": [ + 4, + 5, + 3 + ], + "abstract": "A model must adapt itself to generalize to new and different data during testing. In this setting of fully test-time adaptation the model has only the test data and its own parameters. We propose to adapt by test entropy minimization (tent): we optimize the model for confidence as measured by the entropy of its predictions. Our method estimates normalization statistics and optimizes channel-wise affine transformations to update online on each batch. Tent reduces generalization error for image classification on corrupted ImageNet and CIFAR-10/100 and reaches a new state-of-the-art error on ImageNet-C. Tent handles source-free domain adaptation on digit recognition from SVHN to MNIST/MNIST-M/USPS, on semantic segmentation from GTA to Cityscapes, and on the VisDA-C benchmark. 
These results are achieved in one epoch of test-time optimization without altering training.", + "title": "Tent: Fully Test-Time Adaptation by Entropy Minimization", + "authors": [ + "Dequan Wang", + "Evan Shelhamer", + "Shaoteng Liu", + "Bruno Olshausen", + "Trevor Darrell" + ], + "emails": [ + "~Dequan_Wang1", + "~Shaoteng_Liu1", + "~Evan_Shelhamer2", + "~Bruno_Olshausen1", + "~Trevor_Darrell2" + ], + "rank": 94 + }, + { + "url": "https://openreview.net/forum?id=p-NZIuwqhI4", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 7, + 7, + 7 + ], + "rating": "7.25", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "A deep equilibrium model uses implicit layers, which are implicitly defined through an equilibrium point of an infinite sequence of computation. It avoids any explicit computation of the infinite sequence by finding an equilibrium point directly via root-finding and by computing gradients via implicit differentiation. In this paper, we analyze the gradient dynamics of deep equilibrium models with nonlinearity only on weight matrices and non-convex objective functions of weights for regression and classification. Despite non-convexity, convergence to global optimum at a linear rate is guaranteed without any assumption on the width of the models, allowing the width to be smaller than the output dimension and the number of data points. Moreover, we prove a relation between the gradient dynamics of the deep implicit layer and the dynamics of trust region Newton method of a shallow explicit layer. This mathematically proven relation along with our numerical observation suggests the importance of understanding implicit bias of implicit layers and an open problem on the topic. 
Our proofs deal with implicit layers, weight tying and nonlinearity on weights, and differ from those in the related literature.\n\n", + "title": "On the Theory of Implicit Deep Learning: Global Convergence with Implicit Layers", + "authors": [ + "Kenji Kawaguchi" + ], + "emails": [ + "~Kenji_Kawaguchi1" + ], + "rank": 95 + }, + { + "url": "https://openreview.net/forum?id=3Aoft6NWFej", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 6, + 7, + 8 + ], + "rating": "7.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Masking tokens uniformly at random constitutes a common flaw in the pretraining of Masked Language Models (MLMs) such as BERT. We show that such uniform masking allows an MLM to minimize its training objective by latching onto shallow local signals, leading to pretraining inefficiency and suboptimal downstream performance. To address this flaw, we propose PMI-Masking, a principled masking strategy based on the concept of Pointwise Mutual Information (PMI), which jointly masks a token n-gram if it exhibits high collocation over the corpus. PMI-Masking motivates, unifies, and improves upon prior more heuristic approaches that attempt to address the drawback of random uniform token masking, such as whole-word masking, entity/phrase masking, and random-span masking. 
Specifically, we show experimentally that PMI-Masking reaches the performance of prior masking approaches in half the training time, and consistently improves performance at the end of pretraining.", + "title": "PMI-Masking: Principled masking of correlated spans", + "authors": [ + "Yoav Levine", + "Barak Lenz", + "Opher Lieber", + "Omri Abend", + "Kevin Leyton-Brown", + "Moshe Tennenholtz", + "Yoav Shoham" + ], + "emails": [ + "~Omri_Abend1", + "~Yoav_Shoham1", + "~Yoav_Levine1", + "ai21.com", + "~Moshe_Tennenholtz1", + "~Kevin_Leyton-Brown1" + ], + "rank": 96 + }, + { + "url": "https://openreview.net/forum?id=zQTezqCCtNx", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 8, + 7, + 7 + ], + "rating": "7.24", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "The study of adversarial examples and their activations have attracted significant attention for secure and robust learning with deep neural networks (DNNs). Different from existing works, in this paper, we highlight two new characteristics of adversarial examples from the channel-wise activation perspective: 1) the activation magnitudes of adversarial examples are higher than that of natural examples; and 2) the channels are activated more uniformly by adversarial examples than natural examples. We find that, while the state-of-the-art defense adversarial training has addressed the first issue of high activation magnitude via training on adversarial examples, the second issue of uniform activation remains. This motivates us to suppress redundant activations from being activated by adversarial perturbations during the adversarial training process, via a Channel-wise Activation Suppressing (CAS) training strategy. We show that CAS can train a model that inherently suppresses adversarial activations, and can be easily applied to existing defense methods to further improve their robustness. 
Our work provides a simplebut generic training strategy for robustifying the intermediate layer activations of DNNs.", + "title": "Improving Adversarial Robustness via Channel-wise Activation Suppressing", + "authors": [ + "Yang Bai", + "Yuyuan Zeng", + "Yong Jiang", + "Shu-Tao Xia", + "Xingjun Ma", + "Yisen Wang" + ], + "emails": [ + "~Yang_Bai1", + "~Yong_Jiang3", + "~Yuyuan_Zeng1", + "~Xingjun_Ma1", + "~Shu-Tao_Xia1", + "~Yisen_Wang1" + ], + "rank": 97 + }, + { + "url": "https://openreview.net/forum?id=rJA5Pz7lHKb", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 7, + 7, + 8 + ], + "rating": "7.23", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "While autoregressive models excel at image compression, their sample quality is often lacking. Although not realistic, generated images often have high likelihood according to the model, resembling the case of adversarial examples. Inspired by a successful adversarial defense method, we incorporate randomized smoothing into autoregressive generative modeling. We first model a smoothed version of the data distribution, and then reverse the smoothing process to recover the original data distribution. 
This procedure drastically improves the sample quality of existing autoregressive models on several synthetic and real-world image datasets while obtaining competitive likelihoods on synthetic datasets.", + "title": "Improved Autoregressive Modeling with Distribution Smoothing", + "authors": [ + "Chenlin Meng", + "Jiaming Song", + "Yang Song", + "Shengjia Zhao", + "Stefano Ermon" + ], + "emails": [ + "~Yang_Song1", + "~Jiaming_Song1", + "~Chenlin_Meng1", + "~Stefano_Ermon1", + "~Shengjia_Zhao1" + ], + "rank": 98 + }, + { + "url": "https://openreview.net/forum?id=rq_Qr0c1Hyo", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 7, + 7, + 7 + ], + "rating": "7.23", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "For infinitesimal learning rates, stochastic gradient descent (SGD) follows the path of gradient flow on the full batch loss function. However moderately large learning rates can achieve higher test accuracies, and this generalization benefit is not explained by convergence bounds, since the learning rate which maximizes test accuracy is often larger than the learning rate which minimizes training loss. To interpret this phenomenon we prove that for SGD with random shuffling, the mean SGD iterate also stays close to the path of gradient flow if the learning rate is small and finite, but on a modified loss. This modified loss is composed of the original loss function and an implicit regularizer, which penalizes the norms of the minibatch gradients. Under mild assumptions, when the batch size is small the scale of the implicit regularization term is proportional to the ratio of the learning rate to the batch size. 
We verify empirically that explicitly including the implicit regularizer in the loss can enhance the test accuracy when the learning rate is small.", + "title": "On the Origin of Implicit Regularization in Stochastic Gradient Descent", + "authors": [ + "Samuel L Smith", + "Benoit Dherin", + "David Barrett", + "Soham De" + ], + "emails": [ + "~David_Barrett1", + "~Samuel_L_Smith1", + "~Benoit_Dherin1", + "~Soham_De2" + ], + "rank": 99 + }, + { + "url": "https://openreview.net/forum?id=6isfR3JCbi", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 7, + 6 + ], + "rating": "7.22", + "confidences": [ + 4, + 3, + 2 + ], + "abstract": "Differentially private GANs have proven to be a promising approach for generating realistic synthetic data without compromising the privacy of individuals. Due to the privacy-protective noise introduced in the training, the convergence of GANs becomes even more elusive, which often leads to poor utility in the output generator at the end of training. We propose Private post-GAN boosting (Private PGB), a differentially private method that combines samples produced by the sequence of generators obtained during GAN training to create a high-quality synthetic dataset. To that end, our method leverages the Private Multiplicative Weights method (Hardt and Rothblum, 2010) to reweight generated samples. We evaluate Private PGB on two dimensional toy data, MNIST images, US Census data and a standard machine learning prediction task. Our experiments show that Private PGB improves upon a standard private GAN approach across a collection of quality measures. 
We also provide a non-private variant of PGB that improves the data quality of standard GAN training.", + "title": "Private Post-GAN Boosting", + "authors": [ + "Marcel Neunhoeffer", + "Steven Wu", + "Cynthia Dwork" + ], + "emails": [ + "~Marcel_Neunhoeffer1", + "~Steven_Wu1", + "~Cynthia_Dwork2" + ], + "rank": 100 + }, + { + "url": "https://openreview.net/forum?id=yHeg4PbFHh", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 6, + 9, + 5 + ], + "rating": "7.20", + "confidences": [ + 3, + 4, + 5, + 3 + ], + "abstract": "Program synthesis is challenging largely because of the difficulty of search in a large space of programs. Human programmers routinely tackle the task of writing complex programs by writing sub-programs and then analyzing their intermediate results to compose them in appropriate ways. Motivated by this intuition, we present a new synthesis approach that leverages learning to guide a bottom-up search over programs. In particular, we train a model to prioritize compositions of intermediate values during search conditioned on a given set of input-output examples. This is a powerful combination because of several emergent properties. First, in bottom-up search, intermediate programs can be executed, providing semantic information to the neural network. Second, given the concrete values from those executions, we can exploit rich features based on recent work on property signatures. Finally, bottom-up search allows the system substantial flexibility in what order to generate the solution, allowing the synthesizer to build up a program from multiple smaller sub-programs. Overall, our empirical evaluation finds that the combination of learning and bottom-up search is remarkably effective, even with simple supervised learning approaches. 
We demonstrate the effectiveness of our technique on two datasets, one from the SyGuS competition and one of our own creation.", + "title": "BUSTLE: Bottom-Up Program Synthesis Through Learning-Guided Exploration", + "authors": [ + "Augustus Odena", + "Kensen Shi", + "David Bieber", + "Rishabh Singh", + "Charles Sutton", + "Hanjun Dai" + ], + "emails": [ + "~David_Bieber1", + "~Charles_Sutton1", + "~Rishabh_Singh1", + "~Kensen_Shi1", + "~Augustus_Odena1", + "~Hanjun_Dai1" + ], + "rank": 101 + }, + { + "url": "https://openreview.net/forum?id=43VKWxg_Sqr", + "decision": "Accept (Poster)", + "ratings": [ + 9, + 6, + 6 + ], + "rating": "7.20", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "We present an unsupervised approach that converts the input speech of any individual into audiovisual streams of potentially-infinitely many output speakers. Our approach builds on simple autoencoders that project out-of-sample data onto the distribution of the training set. We use exemplar autoencoders to learn the voice, stylistic prosody, and visual appearance of a specific target exemplar speech. In contrast to existing methods, the proposed approach can be easily extended to an arbitrarily large number of speakers and styles using only 3 minutes of target audio-video data, without requiring any training data for the input speaker. To do so, we learn audiovisual bottleneck representations that capture the structured linguistic content of speech. 
We outperform prior approaches on both audio and video synthesis.\n", + "title": "Unsupervised Audiovisual Synthesis via Exemplar Autoencoders", + "authors": [ + "Kangle Deng", + "Aayush Bansal", + "Deva Ramanan" + ], + "emails": [ + "~Deva_Ramanan1", + "~Aayush_Bansal1", + "~Kangle_Deng1" + ], + "rank": 102 + }, + { + "url": "https://openreview.net/forum?id=6NFBvWlRXaG", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 6, + 6, + 8 + ], + "rating": "7.20", + "confidences": [ + 3, + 2, + 2, + 3 + ], + "abstract": "Learning functions on point clouds has applications in many fields, including computer vision, computer graphics, physics, and chemistry. Recently, there has been a growing interest in neural architectures that are invariant or equivariant to all three shape-preserving transformations of point clouds: translation, rotation, and permutation. In this paper, we present a first study of the approximation power of these architectures. We first derive two sufficient conditions for an equivariant architecture to have the universal approximation property, based on a novel characterization of the space of equivariant polynomials. We then use these conditions to show that two recently suggested models, Tensor field Networks and SE3-Transformers, are universal, and for devising two other novel universal architectures.", + "title": "On the Universality of Rotation Equivariant Point Cloud Networks", + "authors": [ + "Nadav Dym", + "Haggai Maron" + ], + "emails": [ + "~Haggai_Maron1", + "gmail.com" + ], + "rank": 103 + }, + { + "url": "https://openreview.net/forum?id=zeFrfgyZln", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 9, + 7, + 6 + ], + "rating": "7.20", + "confidences": [ + 4, + 5, + 3, + 3 + ], + "abstract": "Conducting text retrieval in a learned dense representation space has many intriguing advantages. Yet dense retrieval (DR) often underperforms word-based sparse retrieval. 
In this paper, we first theoretically show the bottleneck of dense retrieval is the domination of uninformative negatives sampled in mini-batch training, which yield diminishing gradient norms, large gradient variances, and slow convergence. We then propose Approximate nearest neighbor Negative Contrastive Learning (ANCE), which selects hard training negatives globally from the entire corpus. Our experiments demonstrate the effectiveness of ANCE on web search, question answering, and in a commercial search engine, showing ANCE dot-product retrieval nearly matches the accuracy of BERT-based cascade IR pipeline. We also empirically validate our theory that negative sampling with ANCE better approximates the oracle importance sampling procedure and improves learning convergence.", + "title": "Approximate Nearest Neighbor Negative Contrastive Learning for Dense Text Retrieval", + "authors": [ + "Lee Xiong", + "Chenyan Xiong", + "Ye Li", + "Kwok-Fung Tang", + "Jialin Liu", + "Paul N. Bennett", + "Junaid Ahmed", + "Arnold Overwijk" + ], + "emails": [ + "~Paul_N._Bennett1", + "~Lee_Xiong1", + "microsoft.com", + "~Chenyan_Xiong1" + ], + "rank": 104 + }, + { + "url": "https://openreview.net/forum?id=wS0UFjsNYjn", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 8, + 7 + ], + "rating": "7.19", + "confidences": [ + 4, + 5, + 3, + 4 + ], + "abstract": "Unsupervised learning aims to learn meaningful representations from unlabeled data which can captures its intrinsic structure, that can be transferred to downstream tasks. Meta-learning, whose objective is to learn to generalize across tasks such that the learned model can rapidly adapt to a novel task, shares the spirit of unsupervised learning in that the both seek to learn more effective and efficient learning procedure than learning from scratch. The fundamental difference of the two is that the most meta-learning approaches are supervised, assuming full access to the labels. 
However, acquiring labeled dataset for meta-training not only is costly as it requires human efforts in labeling but also limits its applications to pre-defined task distributions. In this paper, we propose a principled unsupervised meta-learning model, namely Meta-GMVAE, based on Variational Autoencoder (VAE) and set-level variational inference. Moreover, we introduce a mixture of Gaussian (GMM) prior, assuming that each modality represents each class-concept in a randomly sampled episode, which we optimize with Expectation-Maximization (EM). Then, the learned model can be used for downstream few-shot classification tasks, where we obtain task-specific parameters by performing semi-supervised EM on the latent representations of the support and query set, and predict labels of the query set by computing aggregated posteriors. We validate our model on Omniglot and Mini-ImageNet datasets by evaluating its performance on downstream few-shot classification tasks. The results show that our model obtain impressive performance gains over existing unsupervised meta-learning baselines, even outperforming supervised MAML on a certain setting.", + "title": "Meta-GMVAE: Mixture of Gaussian VAE for Unsupervised Meta-Learning", + "authors": [ + "Dong Bok Lee", + "Dongchan Min", + "Seanie Lee", + "Sung Ju Hwang" + ], + "emails": [ + "~Dong_Bok_Lee1", + "~Dongchan_Min1", + "~Seanie_Lee1", + "~Sung_Ju_Hwang1" + ], + "rank": 105 + }, + { + "url": "https://openreview.net/forum?id=Tp7kI90Htd", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 8, + 6, + 7 + ], + "rating": "7.19", + "confidences": [ + 4, + 4, + 5, + 3 + ], + "abstract": "Deep neural networks (DNN) have set new standards at predicting responses of neural populations to visual input. 
Most such DNNs consist of a convolutional network (core) shared across all neurons which learns a representation of neural computation in visual cortex and a neuron-specific readout that linearly combines the relevant features in this representation. The goal of this paper is to test whether such a representation is indeed generally characteristic for visual cortex, i.e. generalizes between animals of a species, and what factors contribute to obtaining such a generalizing core. To push all non-linear computations into the core where the generalizing cortical features should be learned, we devise a novel readout that reduces the number of parameters per neuron in the readout by up to two orders of magnitude compared to the previous state-of-the-art. It does so by taking advantage of retinotopy and learns a Gaussian distribution over the neuron\u2019s receptive field position. With this new readout we train our network on neural responses from mouse primary visual cortex (V1) and obtain a gain in performance of 7% compared to the previous state-of-the-art network. We then investigate whether the convolutional core indeed captures general cortical features by using the core in transfer learning to a different animal. When transferring a core trained on thousands of neurons from various animals and scans we exceed the performance of training directly on that animal by 12%, and outperform a commonly used VGG16 core pre-trained on imagenet by 33%. In addition, transfer learning with our data-driven core is more data-efficient than direct training, achieving the same performance with only 40% of the data. 
Our model with its novel readout thus sets a new state-of-the-art for neural response prediction in mouse visual cortex from natural images, generalizes between animals, and captures better characteristic cortical features than current task-driven pre-training approaches such as VGG16.", + "title": "Generalization in data-driven models of primary visual cortex", + "authors": [ + "Konstantin-Klemens Lurz", + "Mohammad Bashiri", + "Konstantin Willeke", + "Akshay Jagadish", + "Eric Wang", + "Edgar Y. Walker", + "Santiago A Cadena", + "Taliah Muhammad", + "Erick Cobos", + "Andreas S. Tolias", + "Alexander S Ecker", + "Fabian H. Sinz" + ], + "emails": [ + "~Andreas_S._Tolias1", + "~Erick_Cobos1", + "~Fabian_H._Sinz1", + "bcm.edu", + "~Konstantin-Klemens_Lurz1", + "~Santiago_A_Cadena1", + "~Edgar_Y._Walker1", + "~Alexander_S_Ecker1", + "uni-tuebingen.de", + "student.uni-tuebingen.de" + ], + "rank": 106 + }, + { + "url": "https://openreview.net/forum?id=S0UdquAnr9k", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 8, + 6, + 8 + ], + "rating": "7.18", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Searching for network width is an effective way to slim deep neural networks with hardware budgets. With this aim, a one-shot supernet is usually leveraged as a performance evaluator to rank the performance \\wrt~different width. Nevertheless, current methods mainly follow a manually fixed weight sharing pattern, which is limited to distinguish the performance gap of different width. In this paper, to better evaluate each width, we propose a locally free weight sharing strategy (CafeNet) accordingly. In CafeNet, weights are more freely shared, and each width is jointly indicated by its base channels and free channels, where free channels are supposed to locate freely in a local zone to better represent each width. Besides, we propose to further reduce the search space by leveraging our introduced FLOPs-sensitive bins. 
As a result, our CafeNet can be trained stochastically and get optimized within a min-min strategy. Extensive experiments on ImageNet, CIFAR-10, CelebA and MS COCO dataset have verified our superiority comparing to other state-of-the-art baselines. For example, our method can further boost the benchmark NAS network EfficientNet-B0 by 0.41\\% via searching its width more delicately.", + "title": "Locally Free Weight Sharing for Network Width Search", + "authors": [ + "Xiu Su", + "Shan You", + "Tao Huang", + "Fei Wang", + "Chen Qian", + "Changshui Zhang", + "Chang Xu" + ], + "emails": [ + "~Xiu_Su1", + "~Tao_Huang5", + "~Changshui_Zhang1", + "~Chang_Xu4", + "~Fei_Wang9", + "~Shan_You3", + "~Chen_Qian1" + ], + "rank": 107 + }, + { + "url": "https://openreview.net/forum?id=-bxf89v3Nx", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 9, + 5 + ], + "rating": "7.18", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "Most supervised machine learning tasks are subject to irreducible prediction errors. Probabilistic predictive models address this limitation by providing probability distributions that represent a belief over plausible targets, rather than point estimates. Such models can be a valuable tool in decision-making under uncertainty, provided that the model output is meaningful and interpretable. Calibrated models guarantee that the probabilistic predictions are neither over- nor under-confident. In the machine learning literature, different measures and statistical tests have been proposed and studied for evaluating the calibration of classification models. For regression problems, however, research has been focused on a weaker condition of calibration based on predicted quantiles for real-valued targets. In this paper, we propose the first framework that unifies calibration evaluation and tests for general probabilistic predictive models. It applies to any such model, including classification and regression models of arbitrary dimension. 
Furthermore, the framework generalizes existing measures and provides a more intuitive reformulation of a recently proposed framework for calibration in multi-class classification. In particular, we reformulate and generalize the kernel calibration error, its estimators, and hypothesis tests using scalar-valued kernels, and evaluate the calibration of real-valued regression\nproblems.", + "title": "Calibration tests beyond classification", + "authors": [ + "David Widmann", + "Fredrik Lindsten", + "Dave Zachariah" + ], + "emails": [ + "liu.se", + "it.uu.se", + "~David_Widmann1" + ], + "rank": 108 + }, + { + "url": "https://openreview.net/forum?id=aDjoksTpXOP", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 9 + ], + "rating": "7.18", + "confidences": [ + 3, + 4, + 5, + 5 + ], + "abstract": "Deep networks are often considered to be more expressive than shallow ones in terms of approximation. Indeed, certain functions can be approximated by deep networks provably more efficiently than by shallow ones, however, no tractable algorithms are known for learning such deep models. Separately, a recent line of work has shown that deep networks trained with gradient descent may behave like (tractable) kernel methods in a certain over-parameterized regime, where the kernel is determined by the architecture and initialization, and this paper focuses on approximation for such kernels. We show that for ReLU activations, the kernels derived from deep fully-connected networks have essentially the same approximation properties as their shallow two-layer counterpart, namely the same eigenvalue decay for the corresponding integral operator. This highlights the limitations of the kernel framework for understanding the benefits of such deep architectures. 
Our main theoretical result relies on characterizing such eigenvalue decays through differentiability properties of the kernel function, which also easily applies to the study of other kernels defined on the sphere.", + "title": "Deep Equals Shallow for ReLU Networks in Kernel Regimes", + "authors": [ + "Alberto Bietti", + "Francis Bach" + ], + "emails": [ + "~Francis_Bach1", + "~Alberto_Bietti1" + ], + "rank": 109 + }, + { + "url": "https://openreview.net/forum?id=6s7ME_X5_Un", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 8, + 7, + 7 + ], + "rating": "7.17", + "confidences": [ + 3, + 2, + 4, + 3 + ], + "abstract": "Interpretation of Deep Neural Networks (DNNs) training as an optimal control problem with nonlinear dynamical systems has received considerable attention recently, yet the algorithmic development remains relatively limited. In this work, we make an attempt along this line by reformulating the training procedure from the trajectory optimization perspective. We first show that most widely-used algorithms for training DNNs can be linked to the Differential Dynamic Programming (DDP), a celebrated second-order method rooted in the Approximate Dynamic Programming. In this vein, we propose a new class of optimizer, DDP Neural Optimizer (DDPNOpt), for training feedforward and convolution networks. DDPNOpt features layer-wise feedback policies which improve convergence and reduce sensitivity to hyper-parameter over existing methods. It outperforms other optimal-control inspired training methods in both convergence and complexity, and is competitive against state-of-the-art first and second order methods. We also observe DDPNOpt has surprising benefit in preventing gradient vanishing. 
Our work opens up new avenues for principled algorithmic design built upon the optimal control theory.", + "title": "DDPNOpt: Differential Dynamic Programming Neural Optimizer", + "authors": [ + "Guan-Horng Liu", + "Tianrong Chen", + "Evangelos Theodorou" + ], + "emails": [ + "~Tianrong_Chen1", + "~Guan-Horng_Liu1", + "~Evangelos_Theodorou1" + ], + "rank": 110 + }, + { + "url": "https://openreview.net/forum?id=xppLmXCbOw1", + "decision": "Accept (Spotlight)", + "ratings": [ + 5, + 7, + 9, + 8 + ], + "rating": "7.17", + "confidences": [ + 5, + 5, + 5, + 3 + ], + "abstract": "Autonomous agents need large repertoires of skills to act reasonably on new tasks that they have not seen before. However, acquiring these skills using only a stream of high-dimensional, unstructured, and unlabeled observations is a tricky challenge for any autonomous agent. Previous methods have used variational autoencoders to encode a scene into a low-dimensional vector that can be used as a goal for an agent to discover new skills. Nevertheless, in compositional/multi-object environments it is difficult to disentangle all the factors of variation into such a fixed-length representation of the whole scene. We propose to use object-centric representations as a modular and structured observation space, which is learned with a compositional generative world model.\nWe show that the structure in the representations in combination with goal-conditioned attention policies helps the autonomous agent to discover and learn useful skills. 
These skills can be further combined to address compositional tasks like the manipulation of several different objects.", + "title": "Self-supervised Visual Reinforcement Learning with Object-centric Representations", + "authors": [ + "Andrii Zadaianchuk", + "Maximilian Seitzer", + "Georg Martius" + ], + "emails": [ + "~Georg_Martius1", + "~Maximilian_Seitzer1", + "~Andrii_Zadaianchuk1" + ], + "rank": 111 + }, + { + "url": "https://openreview.net/forum?id=xCcdBRQEDW", + "decision": "Accept (Spotlight)", + "ratings": [ + 6, + 7, + 7, + 9 + ], + "rating": "7.15", + "confidences": [ + 4, + 3, + 3, + 3 + ], + "abstract": "Simulated virtual environments serve as one of the main driving forces behind developing and evaluating skill learning algorithms. However, existing environments typically only simulate rigid body physics. Additionally, the simulation process usually does not provide gradients that might be useful for planning and control optimizations. We introduce a new differentiable physics benchmark called PasticineLab, which includes a diverse collection of soft body manipulation tasks. In each task, the agent uses manipulators to deform the plasticine into a desired configuration. The underlying physics engine supports differentiable elastic and plastic deformation using the DiffTaichi system, posing many under-explored challenges to robotic agents. We evaluate several existing reinforcement learning (RL) methods and gradient-based methods on this benchmark. Experimental results suggest that 1) RL-based approaches struggle to solve most of the tasks efficiently; 2) gradient-based approaches, by optimizing open-loop control sequences with the built-in differentiable physics engine, can rapidly find a solution within tens of iterations, but still fall short on multi-stage tasks that require long-term planning. 
We expect that PlasticineLab will encourage the development of novel algorithms that combine differentiable physics and RL for more complex physics-based skill learning tasks. PlasticineLab will be made publicly available.", + "title": "PlasticineLab: A Soft-Body Manipulation Benchmark with Differentiable Physics", + "authors": [ + "Zhiao Huang", + "Yuanming Hu", + "Tao Du", + "Siyuan Zhou", + "Hao Su", + "Joshua B. Tenenbaum", + "Chuang Gan" + ], + "emails": [ + "~Yuanming_Hu1", + "~Joshua_B._Tenenbaum1", + "gmail.com", + "~Chuang_Gan1", + "~Tao_Du1", + "~Hao_Su1", + "~Zhiao_Huang1" + ], + "rank": 112 + }, + { + "url": "https://openreview.net/forum?id=LwEQnp6CYev", + "decision": "Accept (Spotlight)", + "ratings": [ + 6, + 7, + 7, + 8 + ], + "rating": "7.15", + "confidences": [ + 2, + 4, + 3, + 4 + ], + "abstract": "For many tasks, the reward function is inaccessible to introspection or too complex to be specified procedurally, and must instead be learned from user data. Prior work has evaluated learned reward functions by evaluating policies optimized for the learned reward. However, this method cannot distinguish between the learned reward function failing to reflect user preferences and the policy optimization process failing to optimize the learned reward. Moreover, this method can only tell us about behavior in the evaluation environment, but the reward may incentivize very different behavior in even a slightly different deployment environment. To address these problems, we introduce the Equivalent-Policy Invariant Comparison (EPIC) distance to quantify the difference between two reward functions directly, without a policy optimization step. We prove EPIC is invariant on an equivalence class of reward functions that always induce the same optimal policy. Furthermore, we find EPIC can be efficiently approximated and is more robust than baselines to the choice of coverage distribution. 
Finally, we show that EPIC distance bounds the regret of optimal policies even under different transition dynamics, and we confirm empirically that it predicts policy training success. Our source code is available at https://github.com/HumanCompatibleAI/evaluating-rewards.", + "title": "Quantifying Differences in Reward Functions", + "authors": [ + "Adam Gleave", + "Michael D Dennis", + "Shane Legg", + "Stuart Russell", + "Jan Leike" + ], + "emails": [ + "~Jan_Leike1", + "~Adam_Gleave1", + "~Shane_Legg1", + "~Michael_D_Dennis1", + "~Stuart_Russell1" + ], + "rank": 113 + }, + { + "url": "https://openreview.net/forum?id=-qh0M9XWxnv", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 6, + 6, + 8 + ], + "rating": "7.14", + "confidences": [ + 4, + 4, + 2, + 4 + ], + "abstract": "In the recent literature of Graph Neural Networks (GNN), the expressive power of models has been studied through their capability to distinguish if two given graphs are isomorphic or not. Since the graph isomorphism problem is NP-intermediate, and Weisfeiler-Lehman (WL) test can give sufficient but not enough evidence in polynomial time, the theoretical power of GNNs is usually evaluated by the equivalence of WL-test order, followed by an empirical analysis of the models on some reference inductive and transductive datasets. However, such analysis does not account the signal processing pipeline, whose capability is generally evaluated in the spectral domain. In this paper, we argue that a spectral analysis of GNNs behavior can provide a complementary point of view to go one step further in the understanding of GNNs. By bridging the gap between the spectral and spatial design of graph convolutions, we theoretically demonstrate some equivalence of the graph convolution process regardless it is designed in the spatial or the spectral domain. Using this connection, we managed to re-formulate most of the state-of-the-art graph neural networks into one common framework. 
This general framework allows to lead a spectral analysis of the most popular GNNs, explaining their performance and showing their limits according to spectral point of view. Our theoretical spectral analysis is confirmed by experiments on various graph databases. Furthermore, we demonstrate the necessity of high and/or band-pass filters on a graph dataset, while the majority of GNN is limited to only low-pass and inevitably it fails.", + "title": "Analyzing the Expressive Power of Graph Neural Networks in a Spectral Perspective", + "authors": [ + "Muhammet Balcilar", + "Guillaume Renton", + "Pierre H\u00e9roux", + "Benoit Ga\u00fcz\u00e8re", + "S\u00e9bastien Adam", + "Paul Honeine" + ], + "emails": [ + "insa-rouen.fr", + "~Paul_Honeine1", + "~Muhammet_Balcilar1", + "~S\u00e9bastien_Adam1", + "gmail.com", + "univ-rouen.fr" + ], + "rank": 114 + }, + { + "url": "https://openreview.net/forum?id=KpfasTaLUpq", + "decision": "Accept (Poster)", + "ratings": [ + 9, + 7, + 5, + 7 + ], + "rating": "7.13", + "confidences": [ + 5, + 3, + 4, + 3 + ], + "abstract": "Much recent effort has been invested in non-autoregressive neural machine translation, which appears to be an efficient alternative to state-of-the-art autoregressive machine translation on modern GPUs. In contrast to the latter, where generation is sequential, the former allows generation to be parallelized across target token positions. Some of the latest non-autoregressive models have achieved impressive translation quality-speed tradeoffs compared to autoregressive baselines. In this work, we reexamine this tradeoff and argue that autoregressive baselines can be substantially sped up without loss in accuracy. Specifically, we study autoregressive models with encoders and decoders of varied depths. Our extensive experiments show that given a sufficiently deep encoder, a single-layer autoregressive decoder can substantially outperform strong non-autoregressive models with comparable inference speed. 
We show that the speed disadvantage for autoregressive baselines compared to non-autoregressive methods has been overestimated in three aspects: suboptimal layer allocation, insufficient speed measurement, and lack of knowledge distillation. Our results establish a new protocol for future research toward fast, accurate machine translation. Our code is available at https://github.com/jungokasai/deep-shallow.", + "title": "Deep Encoder, Shallow Decoder: Reevaluating Non-autoregressive Machine Translation", + "authors": [ + "Jungo Kasai", + "Nikolaos Pappas", + "Hao Peng", + "James Cross", + "Noah Smith" + ], + "emails": [ + "~Nikolaos_Pappas1", + "~James_Cross3", + "~Hao_Peng4", + "~Noah_Smith1", + "~Jungo_Kasai1" + ], + "rank": 115 + }, + { + "url": "https://openreview.net/forum?id=-QxT4mJdijq", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 8, + 9, + 5 + ], + "rating": "7.13", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Many successful deep learning architectures are equivariant to certain transformations in order to conserve parameters and improve generalization: most famously, convolution layers are equivariant to shifts of the input. This approach only works when practitioners know the symmetries of the task and can manually construct an architecture with the corresponding equivariances. Our goal is an approach for learning equivariances from data, without needing to design custom task-specific architectures. We present a method for learning and encoding equivariances into networks by learning corresponding parameter sharing patterns from data. Our method can provably represent equivariance-inducing parameter sharing for any finite group of symmetry transformations. 
Our experiments suggest that it can automatically learn to encode equivariances to common transformations used in image processing tasks.", + "title": "Meta-learning Symmetries by Reparameterization", + "authors": [ + "Allan Zhou", + "Tom Knowles", + "Chelsea Finn" + ], + "emails": [ + "~Chelsea_Finn1", + "stanford.edu", + "~Allan_Zhou1" + ], + "rank": 116 + }, + { + "url": "https://openreview.net/forum?id=7EDgLu9reQD", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 8, + 6, + 7 + ], + "rating": "7.12", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "Learning 3D geometry directly from raw data, such as point clouds, triangle soups, or unoriented meshes is still a challenging task that feeds many downstream computer vision and graphics applications. \n\nIn this paper, we introduce SALD: a method for learning implicit neural representations of shapes directly from raw data. We generalize sign agnostic learning (SAL) to include derivatives: given an unsigned distance function to the input raw data, we advocate a novel sign agnostic regression loss, incorporating both pointwise values and gradients of the unsigned distance function. Optimizing this loss leads to a signed implicit function solution, the zero level set of which is a high quality and valid manifold approximation to the input 3D data. The motivation behind SALD is that incorporating derivatives in a regression loss leads to a lower sample complexity, and consequently better fitting. In addition, we provide empirical evidence, as well as theoretical motivation in 2D that SAL enjoys a minimal surface property, favoring minimal area solutions. More importantly, we are able to show that this property still holds for SALD, i.e., with derivatives included.\n\nWe demonstrate the efficacy of SALD for shape space learning on two challenging datasets: ShapeNet that contains inconsistent orientation and non-manifold meshes, and D-Faust that contains raw 3D scans (triangle soups). 
On both these datasets, we present state-of-the-art results.", + "title": "SALD: Sign Agnostic Learning with Derivatives", + "authors": [ + "Matan Atzmon", + "Yaron Lipman" + ], + "emails": [ + "~Yaron_Lipman1", + "~Matan_Atzmon1" + ], + "rank": 117 + }, + { + "url": "https://openreview.net/forum?id=Xv_s64FiXTv", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 8, + 6, + 8 + ], + "rating": "7.12", + "confidences": [ + 5, + 2, + 4, + 2, + 4 + ], + "abstract": "Action-value estimation is a critical component of many reinforcement learning (RL) methods whereby sample complexity relies heavily on how fast a good estimator for action value can be learned. By viewing this problem through the lens of representation learning, good representations of both state and action can facilitate action-value estimation. While advances in deep learning have seamlessly driven progress in learning state representations, given the specificity of the notion of agency to RL, little attention has been paid to learning action representations. We conjecture that leveraging the combinatorial structure of multi-dimensional action spaces is a key ingredient for learning good representations of action. To test this, we set forth the action hypergraph networks framework---a class of functions for learning action representations in multi-dimensional discrete action spaces with a structural inductive bias. Using this framework we realise an agent class based on a combination with deep Q-networks, which we dub hypergraph Q-networks. 
We show the effectiveness of our approach on a myriad of domains: illustrative prediction problems under minimal confounding effects, Atari 2600 games, and discretised physical control benchmarks.", + "title": "Learning to Represent Action Values as a Hypergraph on the Action Vertices", + "authors": [ + "Arash Tavakoli", + "Mehdi Fatemi", + "Petar Kormushev" + ], + "emails": [ + "~Arash_Tavakoli1", + "~Petar_Kormushev1", + "~Mehdi_Fatemi1" + ], + "rank": 118 + }, + { + "url": "https://openreview.net/forum?id=EqoXe2zmhrh", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 9, + 6, + 7 + ], + "rating": "7.12", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "The dominant paradigm for learning video-text representations \u2013 noise contrastive learning \u2013 increases the similarity of the representations of pairs of samples that are known to be related, such as text and video from the same sample, and pushes away the representations of all other pairs. We posit that this last behaviour is too strict, enforcing dissimilar representations even for samples that are semantically-related \u2013 for example, visually similar videos or ones that share the same depicted action. In this paper, we propose a novel method that alleviates this by leveraging a generative model to naturally push these related samples together: each sample\u2019s caption must be reconstructed as a weighted combination of a support set of visual representations. This simple idea ensures that representations are not overly-specialized to individual samples, are reusable across the dataset, and results in representations that explicitly encode semantics shared between samples, unlike noise contrastive learning. 
Our proposed method outperforms others by a large margin on MSR-VTT, VATEX, ActivityNet, and MSVD for video-to-text and text-to-video retrieval.", + "title": "Support-set bottlenecks for video-text representation learning", + "authors": [ + "Mandela Patrick", + "Po-Yao Huang", + "Yuki Asano", + "Florian Metze", + "Alexander G Hauptmann", + "Joao F. Henriques", + "Andrea Vedaldi" + ], + "emails": [ + "~Joao_F._Henriques1", + "~Andrea_Vedaldi1", + "~Yuki_Asano1", + "~Florian_Metze1", + "~Alexander_G_Hauptmann1", + "~Po-Yao_Huang1", + "~Mandela_Patrick1" + ], + "rank": 119 + }, + { + "url": "https://openreview.net/forum?id=p3_z68kKrus", + "decision": "Reject", + "ratings": [ + 8, + 6, + 8, + 6 + ], + "rating": "7.12", + "confidences": [ + 5, + 4, + 4, + 3 + ], + "abstract": "We study the average CV Leave One Out stability of kernel ridge-less regression and derive corresponding risk bounds. We show that the interpolating solution with minimum norm minimizes a bound on CV Leave One Out stability, which in turn is controlled by the condition number of the empirical kernel matrix. The latter can be characterized in the asymptotic regime where both the dimension and cardinality of the data go to infinity. 
Under the assumption of random kernel matrices, the corresponding test error should be expected to follow a double descent curve.", + "title": "For interpolating kernel machines, minimizing the norm of the ERM solution minimizes stability", + "authors": [ + "Akshay Rangamani", + "Lorenzo Rosasco", + "Tomaso Poggio" + ], + "emails": [ + "~Akshay_Rangamani1", + "~Lorenzo_Rosasco1", + "~Tomaso_Poggio1" + ], + "rank": 120 + }, + { + "url": "https://openreview.net/forum?id=A5VV3UyIQz", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 8, + 7 + ], + "rating": "7.11", + "confidences": [ + 1, + 4, + 4 + ], + "abstract": "Deep one-class classification variants for anomaly detection learn a mapping that concentrates nominal samples in feature space causing anomalies to be mapped away. Because this transformation is highly non-linear, finding interpretations poses a significant challenge. In this paper we present an explainable deep one-class classification method, Fully Convolutional Data Description (FCDD), where the mapped samples are themselves also an explanation heatmap. FCDD yields competitive detection performance and provides reasonable explanations on common anomaly detection benchmarks with CIFAR-10 and ImageNet. On MVTec-AD, a recent manufacturing dataset offering ground-truth anomaly maps, FCDD sets a new state of the art in the unsupervised setting. Our method can incorporate ground-truth anomaly maps during training and using even a few of these (~5) improves performance significantly. Finally, using FCDD's explanations we demonstrate the vulnerability of deep one-class classification models to spurious image features such as image watermarks.", + "title": "Explainable Deep One-Class Classification", + "authors": [ + "Philipp Liznerski", + "Lukas Ruff", + "Robert A. 
Vandermeulen", + "Billy Joe Franks", + "Marius Kloft", + "Klaus Robert Muller" + ], + "emails": [ + "~Klaus_Robert_Muller1", + "cs.uni-kl.de", + "~Marius_Kloft1", + "~Lukas_Ruff1", + "~Robert_A._Vandermeulen2", + "~Philipp_Liznerski1" + ], + "rank": 121 + }, + { + "url": "https://openreview.net/forum?id=S724o4_WB3", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 6, + 7 + ], + "rating": "7.10", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "While second order optimizers such as natural gradient descent (NGD) often speed up optimization, their effect on generalization has been called into question. This work presents a more nuanced view on how the \\textit{implicit bias} of optimizers affects the comparison of generalization properties. \nWe provide an exact asymptotic bias-variance decomposition of the generalization error of preconditioned ridgeless regression in the overparameterized regime, and consider the inverse population Fisher information matrix (used in NGD) as a particular example. We determine the optimal preconditioner P for both the bias and variance, and find that the relative generalization performance of different optimizers depends on label noise and ``shape'' of the signal (true parameters): when the labels are noisy, the model is misspecified, or the signal is misaligned with the features, NGD can achieve lower risk; conversely, GD generalizes better under clean labels, a well-specified model, or aligned signal. \nBased on this analysis, we discuss several approaches to manage the bias-variance tradeoff, and the potential benefit of interpolating between first- and second-order updates. We then extend our analysis to regression in the reproducing kernel Hilbert space and demonstrate that preconditioning can lead to more efficient decrease in the population risk. 
Lastly, we empirically compare the generalization error of first- and second-order optimizers in neural network experiments, and observe robust trends matching our theoretical analysis. ", + "title": "When does preconditioning help or hurt generalization?", + "authors": [ + "Shun-ichi Amari", + "Jimmy Ba", + "Roger Baker Grosse", + "Xuechen Li", + "Atsushi Nitanda", + "Taiji Suzuki", + "Denny Wu", + "Ji Xu" + ], + "emails": [ + "~Shun-ichi_Amari1", + "~Atsushi_Nitanda1", + "~Ji_Xu1", + "~Taiji_Suzuki1", + "~Xuechen_Li1", + "~Denny_Wu2", + "~Roger_Baker_Grosse1", + "~Jimmy_Ba1" + ], + "rank": 122 + }, + { + "url": "https://openreview.net/forum?id=EGdFhBzmAwB", + "decision": "Accept (Spotlight)", + "ratings": [ + 6, + 6, + 7, + 8 + ], + "rating": "7.09", + "confidences": [ + 2, + 2, + 2, + 5 + ], + "abstract": "This paper theoretically investigates the following empirical phenomenon: given a high-complexity network with poor generalization bounds, one can distill it into a network with nearly identical predictions but low complexity and vastly smaller generalization bounds. The main contribution is an analysis showing that the original network inherits this good generalization bound from its distillation, assuming the use of well-behaved data augmentation. This bound is presented both in an abstract and in a concrete form, the latter complemented by a reduction technique to handle modern computation graphs featuring convolutional layers, fully-connected layers, and skip connections, to name a few. To round out the story, a (looser) classical uniform convergence analysis of compression is also presented, as well as a variety of experiments on cifar and mnist demonstrating similar generalization performance between the original network and its distillation. 
\n", + "title": "Generalization bounds via distillation", + "authors": [ + "Daniel Hsu", + "Ziwei Ji", + "Matus Telgarsky", + "Lan Wang" + ], + "emails": [ + "~Daniel_Hsu1", + "~Matus_Telgarsky1", + "~Lan_Wang4", + "~Ziwei_Ji1" + ], + "rank": 123 + }, + { + "url": "https://openreview.net/forum?id=St1giarCHLP", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 8, + 6 + ], + "rating": "7.09", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "Denoising diffusion probabilistic models (DDPMs) have achieved high quality image generation without adversarial training, yet they require simulating a Markov chain for many steps in order to produce a sample. To accelerate sampling, we present denoising diffusion implicit models (DDIMs), a more efficient class of iterative implicit probabilistic models with the same training procedure as DDPMs. In DDPMs, the generative process is defined as the reverse of a particular Markovian diffusion process. We generalize DDPMs via a class of non-Markovian diffusion processes that lead to the same training objective. These non-Markovian processes can correspond to generative processes that are deterministic, giving rise to implicit models that produce high quality samples much faster. We empirically demonstrate that DDIMs can produce high quality samples 10\u00d7 to 50\u00d7 faster in terms of wall-clock time compared to DDPMs, allow us to trade off computation for sample quality, perform semantically meaningful image interpolation directly in the latent space, and reconstruct observations with very low error. 
", + "title": "Denoising Diffusion Implicit Models", + "authors": [ + "Jiaming Song", + "Chenlin Meng", + "Stefano Ermon" + ], + "emails": [ + "~Stefano_Ermon1", + "stanford.edu", + "~Jiaming_Song1" + ], + "rank": 124 + }, + { + "url": "https://openreview.net/forum?id=g-wu9TMPODo", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 7, + 7, + 6 + ], + "rating": "7.08", + "confidences": [ + 4, + 3, + 3, + 3 + ], + "abstract": "We investigate two causes for adversarial vulnerability in deep neural networks: bad data and (poorly) trained models. When trained with SGD, deep neural networks essentially achieve zero training error, even in the presence of label noise, while also exhibiting good generalization on natural test data, something referred to as benign overfitting (Bartlett et al., 2020; Chatterji & Long, 2020). However, these models are vulnerable to adversarial attacks. We identify label noise as one of the causes for adversarial vulnerability, and provide theoretical and empirical evidence in support of this. Surprisingly, we find several instances of label noise in datasets such as MNIST and CIFAR, and that robustly trained models incur training error on some of these, i.e. they don\u2019t fit the noise. However, removing noisy labels alone does not suffice to achieve adversarial robustness. We conjecture that in part sub-optimal representation learning is also responsible for adversarial vulnerability. By means of simple theoretical setups, we show how the choice of representation can drastically affect adversarial robustness.", + "title": "How Benign is Benign Overfitting ?", + "authors": [ + "Amartya Sanyal", + "Puneet K. 
Dokania", + "Varun Kanade", + "Philip Torr" + ], + "emails": [ + "~Puneet_K._Dokania1", + "~Amartya_Sanyal1", + "~Varun_Kanade1", + "~Philip_Torr1" + ], + "rank": 125 + }, + { + "url": "https://openreview.net/forum?id=-TwO99rbVRu", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 8, + 7 + ], + "rating": "7.08", + "confidences": [ + 4, + 5, + 4 + ], + "abstract": "Recent advances in semi-supervised learning (SSL) demonstrate that a combination of consistency regularization and pseudo-labeling can effectively improve image classification accuracy in the low-data regime. Compared to classification, semantic segmentation tasks require much more intensive labeling costs. Thus, these tasks greatly benefit from data-efficient training methods. However, structured outputs in segmentation render particular difficulties (e.g., designing pseudo-labeling and augmentation) to apply existing SSL strategies. To address this problem, we present a simple and novel re-design of pseudo-labeling to generate well-calibrated structured pseudo labels for training with unlabeled or weakly-labeled data. Our proposed pseudo-labeling strategy is network structure agnostic to apply in a one-stage consistency training framework. We demonstrate the effectiveness of the proposed pseudo-labeling strategy in both low-data and high-data regimes. Extensive experiments have validated that pseudo labels generated from wisely fusing diverse sources and strong data augmentation are crucial to consistency training for segmentation. 
The source code will be released.", + "title": "PseudoSeg: Designing Pseudo Labels for Semantic Segmentation", + "authors": [ + "Yuliang Zou", + "Zizhao Zhang", + "Han Zhang", + "Chun-Liang Li", + "Xiao Bian", + "Jia-Bin Huang", + "Tomas Pfister" + ], + "emails": [ + "~Jia-Bin_Huang1", + "~Yuliang_Zou1", + "~Zizhao_Zhang3", + "~Xiao_Bian3", + "~Chun-Liang_Li1", + "~Tomas_Pfister1", + "~Han_Zhang1" + ], + "rank": 126 + }, + { + "url": "https://openreview.net/forum?id=rWZz3sJfCkm", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 8 + ], + "rating": "7.08", + "confidences": [ + 2, + 2, + 4, + 5 + ], + "abstract": "Many problems across computer vision and the natural sciences require the analysis of spherical data, for which representations may be learned efficiently by encoding equivariance to rotational symmetries. We present a generalized spherical CNN framework that encompasses various existing approaches and allows them to be leveraged alongside each other. The only existing non-linear spherical CNN layer that is strictly equivariant has complexity O(C2L5), where C is a measure of representational capacity and L the spherical harmonic bandlimit. Such a high computational cost often prohibits the use of strictly equivariant spherical CNNs. We develop two new strictly equivariant layers with reduced complexity O(CL4) and O(CL3log\u2061L), making larger, more expressive models computationally feasible. Moreover, we adopt efficient sampling theory to achieve further computational savings. We show that these developments allow the construction of more expressive hybrid models that achieve state-of-the-art accuracy and parameter efficiency on spherical benchmark problems.", + "title": "Efficient Generalized Spherical CNNs", + "authors": [ + "Oliver Cobb", + "Christopher G. R. Wallis", + "Augustine N. Mavor-Parker", + "Augustin Marignier", + "Matthew A. 
Price", + "Mayeul d'Avezac", + "Jason McEwen" + ], + "emails": [ + "~Oliver_Cobb1", + "~Jason_McEwen1", + "kagenova.com" + ], + "rank": 127 + }, + { + "url": "https://openreview.net/forum?id=HHiiQKWsOcV", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7, + 8 + ], + "rating": "7.07", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "In attempts to produce machine learning models less reliant on spurious patterns in NLP datasets, researchers have recently proposed curating counterfactually augmented data (CAD) via a human-in-the-loop process in which given some documents and their (initial) labels, humans must revise the text to make a counterfactual label applicable. Importantly, edits that are not necessary to flip the applicable label are prohibited. Models trained on the augmented (original and revised) data appear, empirically, to rely less on semantically irrelevant words and to generalize better out of domain. While this work draws loosely on causal thinking, the underlying causal model (even at an abstract level) and the principles underlying the observed out-of-domain improvements remain unclear. In this paper, we introduce a toy analog based on linear Gaussian models, observing interesting relationships between causal models, measurement noise, out-of-domain generalization, and reliance on spurious signals. Our analysis provides some insights that help to explain the efficacy of CAD. Moreover, we develop the hypothesis that while adding noise to causal features should degrade both in-domain and out-of-domain performance, adding noise to non-causal features should lead to relative improvements in out-of-domain performance. This idea inspires a speculative test for determining whether a feature attribution technique has identified the causal spans. 
If adding noise (e.g., by random word flips) to the highlighted spans degrades both in-domain and out-of-domain performance on a battery of challenge datasets, but adding noise to the complement gives improvements out-of-domain, this suggests we have identified causal spans. Thus, we present a large scale empirical study comparing spans edited to create CAD to those selected by attention and saliency maps. Across numerous challenge domains and models, we find that the hypothesized phenomenon is pronounced for CAD.", + "title": "Explaining the Efficacy of Counterfactually Augmented Data", + "authors": [ + "Divyansh Kaushik", + "Amrith Setlur", + "Eduard H Hovy", + "Zachary Chase Lipton" + ], + "emails": [ + "~Eduard_H_Hovy1", + "~Zachary_Chase_Lipton1", + "~Amrith_Setlur1", + "~Divyansh_Kaushik1" + ], + "rank": 128 + }, + { + "url": "https://openreview.net/forum?id=9ITXiTrAoT", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 7, + 6, + 7 + ], + "rating": "7.07", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Language models must capture statistical dependencies between words at timescales ranging from very short to very long. Earlier work has demonstrated that dependencies in natural language tend to decay with distance between words according to a power law. However, it is unclear how this knowledge can be used for analyzing or designing neural network language models. In this work, we derived a theory for how the memory gating mechanism in long short-term memory (LSTM) language models can capture power law decay. We found that unit timescales within an LSTM, which are determined by the forget gate bias, should follow an Inverse Gamma distribution. Experiments then showed that LSTM language models trained on natural English text learn to approximate this theoretical distribution. 
Further, we found that explicitly imposing the theoretical distribution upon the model during training yielded better language model perplexity overall, with particular improvements for predicting low-frequency (rare) words. Moreover, the explicit multi-timescale model selectively routes information about different types of words through units with different timescales, potentially improving model interpretability. These results demonstrate the importance of careful, theoretically-motivated analysis of memory and timescale in language models.", + "title": "Multi-timescale Representation Learning in LSTM Language Models", + "authors": [ + "Shivangi Mahto", + "Vy Ai Vo", + "Javier S. Turek", + "Alexander Huth" + ], + "emails": [ + "utexas.edu", + "~Javier_S._Turek1", + "intel.com", + "~Alexander_Huth1" + ], + "rank": 129 + }, + { + "url": "https://openreview.net/forum?id=XJk19XzGq2J", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 8, + 6 + ], + "rating": "7.07", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "It is widely believed that natural image data exhibits low-dimensional structure despite the high dimensionality of conventional pixel representations. This idea underlies a common intuition for the remarkable success of deep learning in computer vision. In this work, we apply dimension estimation tools to popular datasets and investigate the role of low-dimensional structure in deep learning. We find that common natural image datasets indeed have very low intrinsic dimension relative to the high number of pixels in the images. Additionally, we find that low dimensional datasets are easier for neural networks to learn, and models solving these tasks generalize better from training to test data. Along the way, we develop a technique for validating our dimension estimation tools on synthetic data generated by GANs allowing us to actively manipulate the intrinsic dimension by controlling the image generation process. 
Code for our experiments may be found \\href{https://github.com/ppope/dimensions}{here}.", + "title": "The Intrinsic Dimension of Images and Its Impact on Learning", + "authors": [ + "Phil Pope", + "Chen Zhu", + "Ahmed Abdelkader", + "Micah Goldblum", + "Tom Goldstein" + ], + "emails": [ + "~Ahmed_Abdelkader1", + "~Chen_Zhu2", + "~Micah_Goldblum1", + "~Tom_Goldstein1", + "~Phil_Pope1" + ], + "rank": 130 + }, + { + "url": "https://openreview.net/forum?id=YLewtnvKgR7", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 9, + 6, + 6 + ], + "rating": "7.07", + "confidences": [ + 2, + 5, + 3, + 4 + ], + "abstract": "Predictive uncertainty estimation is an essential next step for the reliable deployment of deep object detectors in safety-critical tasks. In this work, we focus on estimating predictive distributions for bounding box regression output with variance networks. We show that in the context of object detection, training variance networks with negative log likelihood (NLL) can lead to high entropy predictive distributions regardless of the correctness of the output mean. We propose to use the energy score as a non-local proper scoring rule and find that when used for training, the energy score leads to better calibrated and lower entropy predictive distributions than NLL. We also address the widespread use of non-proper scoring metrics for evaluating predictive distributions from deep object detectors by proposing an alternate evaluation approach founded on proper scoring rules. Using the proposed evaluation tools, we show that although variance networks can be used to produce high quality predictive distributions, ad-hoc approaches used by seminal object detectors for choosing regression targets during training do not provide wide enough data support for reliable variance learning. We hope that our work helps shift evaluation in probabilistic object detection to better align with predictive uncertainty evaluation in other machine learning domains. 
Code for all models, evaluation, and datasets is available at: https://github.com/asharakeh/probdet.git.", + "title": "Estimating and Evaluating Regression Predictive Uncertainty in Deep Object Detectors", + "authors": [ + "Ali Harakeh", + "Steven L. Waslander" + ], + "emails": [ + "~Ali_Harakeh1", + "~Steven_L._Waslander1" + ], + "rank": 131 + }, + { + "url": "https://openreview.net/forum?id=bnY0jm4l59", + "decision": "Accept (Spotlight)", + "ratings": [ + 6, + 8, + 7, + 7 + ], + "rating": "7.07", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Deep learning is slowly, but steadily, hitting a memory bottleneck. While the tensor computation in top-of-the-line GPUs increased by 32\u00d7 over the last five years, the total available memory only grew by 2.5\u00d7. This prevents researchers from exploring larger architectures, as training large networks requires more memory for storing intermediate outputs. In this paper, we present MONeT, an automatic framework that minimizes both the memory footprint and computational overhead of deep networks. MONeT jointly optimizes the checkpointing schedule and the implementation of various operators. MONeT is able to outperform all prior hand-tuned operations as well as automated checkpointing. MONeT reduces the overall memory requirement by 3\u00d7 for various PyTorch models, with a 9-16% overhead in computation. For the same computation cost, MONeT requires 1.2-1.8\u00d7 less memory than current state-of-the-art automated checkpointing frameworks. 
Our code will be made publicly available upon acceptance.", + "title": "Memory Optimization for Deep Networks", + "authors": [ + "Aashaka Shah", + "Chao-Yuan Wu", + "Jayashree Mohan", + "Vijay Chidambaram", + "Philipp Kraehenbuehl" + ], + "emails": [ + "~Vijay_Chidambaram1", + "~Aashaka_Shah1", + "~Philipp_Kraehenbuehl1", + "~Chao-Yuan_Wu1", + "cs.utexas.edu" + ], + "rank": 132 + }, + { + "url": "https://openreview.net/forum?id=PS3IMnScugk", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 7, + 7, + 6 + ], + "rating": "7.07", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Flexible neural sequence models outperform grammar- and automaton-based counterparts on a variety of tasks. However, neural models perform poorly in settings requiring compositional generalization beyond the training data\u2014particularly to rare or unseen subsequences. Past work has found symbolic scaffolding (e.g. grammars or automata) essential in these settings. We describe R&R, a learned data augmentation scheme that enables a large category of compositional generalizations without appeal to latent symbolic structure. R&R has two components: recombination of original training examples via a prototype-based generative model and resampling of generated examples to encourage extrapolation. 
Training an ordinary neural sequence model on a dataset augmented with recombined and resampled examples significantly improves generalization in two language processing problems\u2014instruction following (SCAN) and morphological analysis (SIGMORPHON 2018)\u2014where R&R enables learning of new constructions and tenses from as few as eight initial examples.", + "title": "Learning to Recombine and Resample Data For Compositional Generalization", + "authors": [ + "Ekin Aky\u00fcrek", + "Afra Feyza Aky\u00fcrek", + "Jacob Andreas" + ], + "emails": [ + "~Jacob_Andreas1", + "~Ekin_Aky\u00fcrek1", + "~Afra_Feyza_Aky\u00fcrek1" + ], + "rank": 133 + }, + { + "url": "https://openreview.net/forum?id=q3KSThy2GwB", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 7, + 7, + 6 + ], + "rating": "7.06", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "Recurrent neural networks are usually trained with backpropagation through time, which requires storing a complete history of network states, and prohibits updating the weights \"online\" (after every timestep). Real Time Recurrent Learning (RTRL) eliminates the need for history storage and allows for online weight updates, but does so at the expense of computational costs that are quartic in the state size. This renders RTRL training intractable for all but the smallest networks, even ones that are made highly sparse.\nWe introduce the Sparse n-step Approximation (SnAp) to the RTRL influence matrix. SnAp only tracks the influence of a parameter on hidden units that are reached by the computation graph within n timesteps of the recurrent core. SnAp with n=1 is no more expensive than backpropagation but allows training on arbitrarily long sequences. We find that it substantially outperforms other RTRL approximations with comparable costs such as Unbiased Online Recurrent Optimization. 
For highly sparse networks, SnAp with n=2 remains tractable and can outperform backpropagation through time in terms of learning speed when updates are done online.", + "title": "Practical Real Time Recurrent Learning with a Sparse Approximation", + "authors": [ + "Jacob Menick", + "Erich Elsen", + "Utku Evci", + "Simon Osindero", + "Karen Simonyan", + "Alex Graves" + ], + "emails": [ + "~Simon_Osindero1", + "~Alex_Graves1", + "~Utku_Evci1", + "~Jacob_Menick1", + "~Erich_Elsen1", + "~Karen_Simonyan1" + ], + "rank": 134 + }, + { + "url": "https://openreview.net/forum?id=rumv7QmLUue", + "decision": "Accept (Spotlight)", + "ratings": [ + 6, + 6, + 9, + 7 + ], + "rating": "7.06", + "confidences": [ + 4, + 5, + 5, + 3 + ], + "abstract": "Recent network pruning methods focus on pruning models early-on in training. To estimate the impact of removing a parameter, these methods use importance measures that were originally designed to prune trained models. Despite lacking justification for their use early-on in training, such measures result in surprisingly low accuracy loss. To better explain this behavior, we develop a general framework that uses gradient flow to unify state-of-the-art importance measures through the norm of model parameters. We use this framework to determine the relationship between pruning measures and evolution of model parameters, establishing several results related to pruning models early-on in training: (i) magnitude-based pruning removes parameters that contribute least to reduction in loss, resulting in models that converge faster than magnitude-agnostic methods; (ii) loss-preservation based pruning preserves first-order model evolution dynamics and its use is therefore justified for pruning minimally trained models; and (iii) gradient-norm based pruning affects second-order model evolution dynamics, such that increasing gradient norm via pruning can produce poorly performing models. 
We validate our claims on several VGG-13, MobileNet-V1, and ResNet-56 models trained on CIFAR-10/CIFAR-100.", + "title": "A Gradient Flow Framework For Analyzing Network Pruning", + "authors": [ + "Ekdeep Singh Lubana", + "Robert Dick" + ], + "emails": [ + "~Ekdeep_Singh_Lubana1", + "~Robert_Dick1" + ], + "rank": 135 + }, + { + "url": "https://openreview.net/forum?id=jHefDGsorp5", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 7, + 6, + 7 + ], + "rating": "7.06", + "confidences": [ + 4, + 5, + 3, + 4 + ], + "abstract": "Optimizing molecules for desired properties is a fundamental yet challenging task in chemistry, material science, and drug discovery. This paper develops a novel algorithm for optimizing molecular properties via an Expectation-Maximization (EM) like explainable evolutionary process. The algorithm is designed to mimic human experts in the process of searching for desirable molecules and alternate between two stages: the first stage on explainable local search which identifies rationales, i.e., critical subgraph patterns accounting for desired molecular properties, and the second stage on molecule completion which explores the larger space of molecules containing good rationales. We test our approach against various baselines on a real-world multi-property optimization task where each method is given the same number of queries to the property oracle. We show that our evolution-by-explanation algorithm is 79% better than the best baseline in terms of a generic metric combining aspects such as success rate, novelty, and diversity. Human expert evaluation on optimized molecules shows that 60% of top molecules obtained from our methods are deemed successful. 
", + "title": "Molecule Optimization by Explainable Evolution", + "authors": [ + "Binghong Chen", + "Tianzhe Wang", + "Chengtao Li", + "Hanjun Dai", + "Le Song" + ], + "emails": [ + "~Binghong_Chen1", + "~Le_Song1", + "~Tianzhe_Wang1", + "~Hanjun_Dai1", + "~Chengtao_Li1" + ], + "rank": 136 + }, + { + "url": "https://openreview.net/forum?id=LVotkZmYyDi", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 8, + 5, + 7 + ], + "rating": "7.06", + "confidences": [ + 4, + 5, + 4, + 3 + ], + "abstract": "The gradient descent-ascent (GDA) algorithm has been widely applied to solve minimax optimization problems. In order to achieve convergent policy parameters for minimax optimization, it is important that GDA generates convergent variable sequences rather than convergent sequences of function value or gradient norm. However, the variable convergence of GDA has been proved only under convexity geometries, and it is lack of understanding in general nonconvex minimax optimization. This paper fills such a gap by studying the convergence of a more general proximal-GDA for regularized nonconvex-strongly-concave minimax optimization. Specifically, we show that proximal-GDA admits a novel Lyapunov function, which monotonically decreases in the minimax optimization process and drives the variable sequences to a critical point. By leveraging this Lyapunov function and the KL geometry that parameterizes the local geometries of general nonconvex functions, we formally establish the variable convergence of proximal-GDA to a certain critical point x\u2217, i.e., xt\u2192x\u2217,yt\u2192y\u2217(x\u2217). Furthermore, over the full spectrum of the KL-parameterized geometry, we show that proximal-GDA achieves different types of convergence rates ranging from sublinear convergence up to finite-step convergence, depending on the geometry associated with the KL parameter. This is the first theoretical result on the variable convergence for nonconvex minimax optimization. 
", + "title": "Proximal Gradient Descent-Ascent: Variable Convergence under K\u0141 Geometry", + "authors": [ + "Ziyi Chen", + "Yi Zhou", + "Tengyu Xu", + "Yingbin Liang" + ], + "emails": [ + "~Ziyi_Chen2", + "~Tengyu_Xu1", + "~Yi_Zhou2", + "~Yingbin_Liang1" + ], + "rank": 137 + }, + { + "url": "https://openreview.net/forum?id=w2mYg3d0eot", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 8, + 6, + 7 + ], + "rating": "7.06", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "This paper studies the behaviour of the stochastic subgradient descent (SSGD) method applied to over-parameterized nonsmooth optimization problems that satisfy an interpolation condition. By leveraging the composite structure of the empirical risk minimization problems, we prove that SSGD converges, respectively, with rates O(1/\u03f5) and O(log\u2061(1/\u03f5)) for convex and strongly-convex objectives when interpolation holds. These rates coincide with established rates for the stochastic gradient descent (SGD) method applied to smooth problems that also satisfy an interpolation condition. Our analysis provides a partial explanation for the empirical observation that sometimes SGD and SSGD behave similarly for training smooth and nonsmooth machine learning models. We also prove that the rate O(1/\u03f5) is optimal for the subgradient method in the convex and interpolation setting.", + "title": "Fast convergence of stochastic subgradient method under interpolation", + "authors": [ + "Huang Fang", + "Zhenan Fan", + "Michael Friedlander" + ], + "emails": [ + "~Huang_Fang1", + "cs.ubc.ca" + ], + "rank": 138 + }, + { + "url": "https://openreview.net/forum?id=hsFN92eQEla", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 8 + ], + "rating": "7.06", + "confidences": [ + 3, + 4, + 4, + 5 + ], + "abstract": "Modern neural architectures for classification tasks are trained using the cross-entropy loss, which is widely believed to be empirically superior to the square loss. 
In this work we provide evidence indicating that this belief may not be well-founded. \nWe explore several major neural architectures and a range of standard benchmark datasets for NLP, automatic speech recognition (ASR) and computer vision tasks to show that these architectures, with the same hyper-parameter settings as reported in the literature, perform comparably or better when trained with the square loss, even after equalizing computational resources.\nIndeed, we observe that the square loss produces better results in the dominant majority of NLP and ASR experiments. Cross-entropy appears to have a slight edge on computer vision tasks.\n\nWe argue that there is little compelling empirical or theoretical evidence indicating a clear-cut advantage to the cross-entropy loss. Indeed, in our experiments, performance on nearly all non-vision tasks can be improved, sometimes significantly, by switching to the square loss. Furthermore, training with square loss appears to be less sensitive to the randomness in initialization. We posit that\ntraining using the square loss for classification needs to be a part of best practices of modern deep learning on equal footing with cross-entropy. ", + "title": "EVALUATION OF NEURAL ARCHITECTURES TRAINED WITH SQUARE LOSS VS CROSS-ENTROPY IN CLASSIFICATION TASKS", + "authors": [ + "Like Hui", + "Mikhail Belkin" + ], + "emails": [ + "~Like_Hui1", + "ucsd.edu" + ], + "rank": 139 + }, + { + "url": "https://openreview.net/forum?id=5m3SEczOV8L", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 6, + 8 + ], + "rating": "7.06", + "confidences": [ + 5, + 4, + 3, + 4 + ], + "abstract": "Energy-based models (EBMs) have recently been successful in representing complex distributions of small images. However, sampling from them requires expensive Markov chain Monte Carlo (MCMC) iterations that mix slowly in high dimensional pixel space. 
Unlike EBMs, variational autoencoders (VAEs) generate samples quickly and are equipped with a latent space that enables fast traversal of the data manifold. However, VAEs tend to assign high probability density to regions in data space outside the actual data distribution and often fail at generating sharp images. In this paper, we propose VAEBM, a symbiotic composition of a VAE and an EBM that offers the best of both worlds. VAEBM captures the overall mode structure of the data distribution using a state-of-the-art VAE and it relies on its EBM component to explicitly exclude non-data-like regions from the model and refine the image samples. Moreover, the VAE component in VAEBM allows us to speed up MCMC updates by reparameterizing them in the VAE's latent space. Our experimental results show that VAEBM outperforms state-of-the-art VAEs and EBMs in generative quality on several benchmark image datasets by a large margin. It can generate high-quality images as large as 256\u00d7256 pixels with short MCMC chains. We also demonstrate that VAEBM provides complete mode coverage and performs well in out-of-distribution detection. 
", + "title": "VAEBM: A Symbiosis between Variational Autoencoders and Energy-based Models", + "authors": [ + "Zhisheng Xiao", + "Karsten Kreis", + "Jan Kautz", + "Arash Vahdat" + ], + "emails": [ + "~Karsten_Kreis1", + "~Zhisheng_Xiao1", + "~Arash_Vahdat3", + "~Jan_Kautz1" + ], + "rank": 140 + }, + { + "url": "https://openreview.net/forum?id=zWy1uxjDdZJ", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 8, + 6, + 7 + ], + "rating": "7.06", + "confidences": [ + 4, + 5, + 4, + 3 + ], + "abstract": "Local robustness ensures that a model classifies all inputs within an \u2113p-ball consistently, which precludes various forms of adversarial inputs.\nIn this paper, we present a fast procedure for checking local robustness in feed-forward neural networks with piecewise-linear activation functions.\nSuch networks partition the input space into a set of convex polyhedral regions in which the network\u2019s behavior is linear; \nhence, a systematic search for decision boundaries within the regions around a given input is sufficient for assessing robustness.\nCrucially, we show how the regions around a point can be analyzed using simple geometric projections, thus admitting an efficient, highly-parallel GPU implementation that excels particularly for the \u21132 norm, where previous work has been less effective.\nEmpirically we find this approach to be far more precise than many approximate verification approaches, while at the same time performing multiple orders of magnitude faster than complete verifiers, and scaling to much deeper networks.", + "title": "Fast Geometric Projections for Local Robustness Certification", + "authors": [ + "Aymeric Fromherz", + "Klas Leino", + "Matt Fredrikson", + "Bryan Parno", + "Corina Pasareanu" + ], + "emails": [ + "~Klas_Leino1", + "~Matt_Fredrikson1", + "~Aymeric_Fromherz1", + "~Corina_Pasareanu1", + "~Bryan_Parno1" + ], + "rank": 141 + }, + { + "url": "https://openreview.net/forum?id=b9PoimzZFJ", + "decision": "Accept 
(Spotlight)", + "ratings": [ + 6, + 6, + 8, + 8 + ], + "rating": "7.00", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "We consider situations where the presence of dominant simpler correlations with the target variable in a training set can cause an SGD-trained neural network to be less reliant on more persistently correlating complex features. When the non-persistent, simpler correlations correspond to non-semantic background factors, a neural network trained on this data can exhibit dramatic failure upon encountering systematic distributional shift, where the correlating background features are recombined with different objects. We perform an empirical study on three synthetic datasets, showing that group invariance methods across inferred partitionings of the training set can lead to significant improvements at such test-time situations. We also suggest a simple invariance penalty, showing with experiments on our setups that it can perform better than alternatives. We find that even without assuming access to any systematically shifted validation sets, one can still find improvements over an ERM-trained reference model.", + "title": "Systematic generalisation with group invariant predictions", + "authors": [ + "Faruk Ahmed", + "Yoshua Bengio", + "Harm van Seijen", + "Aaron Courville" + ], + "emails": [ + "~Faruk_Ahmed1", + "~Aaron_Courville3", + "~Harm_van_Seijen1", + "~Yoshua_Bengio1" + ], + "rank": 142 + }, + { + "url": "https://openreview.net/forum?id=O7ms4LFdsX", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "Learning disentangled representations leads to interpretable models and facilitates data generation with style transfer, which has been extensively studied on static data such as images in an unsupervised learning framework. 
However, only a few works have explored unsupervised disentangled sequential representation learning due to challenges of generating sequential data. In this paper, we propose recurrent Wasserstein Autoencoder (R-WAE), a new framework for generative modeling of sequential data. R-WAE disentangles the representation of an input sequence into static and dynamic factors (i.e., time-invariant and time-varying parts). Our theoretical analysis shows that, R-WAE minimizes an upper bound of a penalized form of the Wasserstein distance between model distribution and sequential data distribution, and simultaneously maximizes the mutual information between input data and different disentangled latent factors, respectively. This is superior to (recurrent) VAE which does not explicitly enforce mutual information maximization between input data and disentangled latent representations. When the number of actions in sequential data is available as weak supervision information, R-WAE is extended to learn a categorical latent representation of actions to improve its disentanglement. 
Experiments on a variety of datasets show that our models outperform other baselines with the same settings in terms of disentanglement and unconditional video generation both quantitatively and qualitatively.", + "title": "Disentangled Recurrent Wasserstein Autoencoder ", + "authors": [ + "Jun Han", + "Martin Renqiang Min", + "Ligong Han", + "Li Erran Li", + "Xuan Zhang" + ], + "emails": [ + "~Martin_Renqiang_Min1", + "~Jun_Han4", + "~Ligong_Han1", + "~Xuan_Zhang3", + "~Li_Erran_Li1" + ], + "rank": 143 + }, + { + "url": "https://openreview.net/forum?id=FOyuZ26emy", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Subspace clustering is an unsupervised clustering technique designed to cluster data that is supported on a union of linear subspaces, with each subspace defining a cluster with dimension lower than the ambient space. Many existing formulations for this problem are based on exploiting the self-expressive property of linear subspaces, where any point within a subspace can be represented as linear combination of other points within the subspace. To extend this approach to data supported on a union of non-linear manifolds, numerous studies have proposed learning an embedding of the original data using a neural network which is regularized by a self-expressive loss function on the data in the embedded space to encourage a union of linear subspaces prior on the data in the embedded space. Here we show that there are a number of potential flaws with this approach which have not been adequately addressed in prior work. In particular, we show the model formulation is often ill-posed in that it can lead to a degenerate embedding of the data, which need not correspond to a union of subspaces at all and is poorly suited for clustering. 
We validate our theoretical results experimentally and also repeat prior experiments reported in the literature, where we conclude that a significant portion of the previously claimed performance benefits can be attributed to an ad-hoc post processing step rather than the deep subspace clustering model.", + "title": "A Critique of Self-Expressive Deep Subspace Clustering", + "authors": [ + "Benjamin David Haeffele", + "Chong You", + "Rene Vidal" + ], + "emails": [ + "~Rene_Vidal1", + "~Chong_You2", + "~Benjamin_David_Haeffele1" + ], + "rank": 144 + }, + { + "url": "https://openreview.net/forum?id=27acGyyI1BY", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 3, + 2, + 3, + 4 + ], + "abstract": "Neural Ordinary Differential Equations (NODEs) use a neural network to model the instantaneous rate of change in the state of a system. However, despite their apparent suitability for dynamics-governed time-series, NODEs present a few disadvantages. First, they are unable to adapt to incoming data-points, a fundamental requirement for real-time applications imposed by the natural direction of time. Second, time-series are often composed of a sparse set of measurements that could be explained by many possible underlying dynamics. NODEs do not capture this uncertainty. In contrast, Neural Processes (NPs) are a new class of stochastic processes providing uncertainty estimation and fast data-adaptation, but lack an explicit treatment of the flow of time. To address these problems, we introduce Neural ODE Processes (NDPs), a new class of stochastic processes determined by a distribution over Neural ODEs. By maintaining an adaptive data-dependent distribution over the underlying ODE, we show that our model can successfully capture the dynamics of low-dimensional systems from just a few data-points. 
At the same time, we demonstrate that NDPs scale up to challenging high-dimensional time-series with unknown latent dynamics such as rotating MNIST digits. ", + "title": "Neural ODE Processes", + "authors": [ + "Alexander Norcliffe", + "Cristian Bodnar", + "Ben Day", + "Jacob Moss", + "Pietro Li\u00f2" + ], + "emails": [ + "cam.ac.uk", + "~Pietro_Li\u00f21", + "~Ben_Day1", + "~Cristian_Bodnar1", + "gmail.com" + ], + "rank": 145 + }, + { + "url": "https://openreview.net/forum?id=KUDUoRsEphu", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 3, + 5, + 5, + 3 + ], + "abstract": "Fast and stable fluid simulations are an essential prerequisite for applications ranging from computer-generated imagery to computer-aided design in research and development. However, solving the partial differential equations of incompressible fluids is a challenging task and traditional numerical approximation schemes come at high computational costs. Recent deep learning based approaches promise vast speed-ups but do not generalize to new fluid domains, require fluid simulation data for training, or rely on complex pipelines that outsource major parts of the fluid simulation to traditional methods.\n\nIn this work, we propose a novel physics-constrained training approach that generalizes to new fluid domains, requires no fluid simulation data, and allows convolutional neural networks to map a fluid state from time-point t to a subsequent state at time t+dt in a single forward pass. This simplifies the pipeline to train and evaluate neural fluid models. After training, the framework yields models that are capable of fast fluid simulations and can handle various fluid phenomena including the Magnus effect and K\u00e1rm\u00e1n vortex streets. We present an interactive real-time demo to show the speed and generalization capabilities of our trained models. 
Moreover, the trained neural networks are efficient differentiable fluid solvers as they offer a differentiable update step to advance the fluid simulation in time. We exploit this fact in a proof-of-concept optimal control experiment. Our models significantly outperform a recent differentiable fluid solver in terms of computational speed and accuracy.", + "title": "Learning Incompressible Fluid Dynamics from Scratch - Towards Fast, Differentiable Fluid Models that Generalize", + "authors": [ + "Nils Wandel", + "Michael Weinmann", + "Reinhard Klein" + ], + "emails": [ + "~Reinhard_Klein1", + "~Michael_Weinmann1", + "~Nils_Wandel2" + ], + "rank": 146 + }, + { + "url": "https://openreview.net/forum?id=iWLByfvUhN", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 6, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "In this work, we propose a new generative model that is capable of automatically decoupling global and local representations of images in an entirely unsupervised setting, by embedding a generative flow in the VAE framework to model the decoder.\nSpecifically, the proposed model utilizes the variational auto-encoding framework to learn a (low-dimensional) vector of latent variables to capture the global information of an image, which is fed as a conditional input to a flow-based invertible decoder with architecture borrowed from style transfer literature.\nExperimental results on standard image benchmarks demonstrate the effectiveness of our model in terms of density estimation, image generation and unsupervised representation learning.\nImportantly, this work demonstrates that with only architectural inductive biases, a generative model with a likelihood-based objective is capable of learning decoupled representations, requiring no explicit supervision.\nThe code for our model is available at \\url{https://github.com/XuezheMax/wolf}.", + "title": "Decoupling Global and Local Representations via Invertible 
Generative Flows", + "authors": [ + "Xuezhe Ma", + "Xiang Kong", + "Shanghang Zhang", + "Eduard H Hovy" + ], + "emails": [ + "~Xiang_Kong1", + "~Xuezhe_Ma1", + "~Shanghang_Zhang4", + "~Eduard_H_Hovy1" + ], + "rank": 147 + }, + { + "url": "https://openreview.net/forum?id=zElset1Klrp", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Recent work has shown that sparse representations---where only a small percentage of units are active---can significantly reduce interference. Those works, however, relied on relatively complex regularization or meta-learning approaches, that have only been used offline in a pre-training phase. In this work, we pursue a direction that achieves sparsity by design, rather than by learning. Specifically, we design an activation function that produces sparse representations deterministically by construction, and so is more amenable to online training. The idea relies on the simple approach of binning, but overcomes the two key limitations of binning: zero gradients for the flat regions almost everywhere, and lost precision---reduced discrimination---due to coarse aggregation. We introduce a Fuzzy Tiling Activation (FTA) that provides non-negligible gradients and produces overlap between bins that improves discrimination. We first show that FTA is robust under covariate shift in a synthetic online supervised learning problem, where we can vary the level of correlation and drift. Then we move to the deep reinforcement learning setting and investigate both value-based and policy gradient algorithms that use neural networks with FTAs, in classic discrete control and Mujoco continuous control environments. We show that algorithms equipped with FTAs are able to learn a stable policy faster without needing target networks on most domains. 
", + "title": "Fuzzy Tiling Activations: A Simple Approach to Learning Sparse Representations Online", + "authors": [ + "Yangchen Pan", + "Kirby Banman", + "Martha White" + ], + "emails": [ + "~Yangchen_Pan2", + "~Martha_White1", + "ualberta.ca" + ], + "rank": 148 + }, + { + "url": "https://openreview.net/forum?id=WAISmwsqDsb", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 3, + 2, + 2 + ], + "abstract": "Domain translation is the process of transforming data from one domain to another while preserving the common semantics. Some of the most popular domain translation systems are based on conditional generative adversarial networks, which use source domain data to drive the generator and as an input to the discriminator. However, this approach does not enforce the preservation of shared semantics since the conditional input can often be ignored by the discriminator. We propose an alternative method for conditioning and present a new framework, where two networks are simultaneously trained, in a supervised manner, to perform domain translation in opposite directions. Our method is not only better at capturing the shared information between two domains but is more generic and can be applied to a broader range of problems. 
The proposed framework performs well even in challenging cross-modal translations, such as video-driven speech reconstruction, for which other systems struggle to maintain correspondence.", + "title": "DINO: A Conditional Energy-Based GAN for Domain Translation", + "authors": [ + "Konstantinos Vougioukas", + "Stavros Petridis", + "Maja Pantic" + ], + "emails": [ + "~Konstantinos_Vougioukas1", + "~Maja_Pantic1", + "~Stavros_Petridis1" + ], + "rank": 149 + }, + { + "url": "https://openreview.net/forum?id=R4aWTjmrEKM", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 4, + 2, + 4, + 2 + ], + "abstract": "Policy-Space Response Oracles (PSRO) is a general algorithmic framework for learning policies in multiagent systems by interleaving empirical game analysis with deep reinforcement learning (DRL).\nAt each iteration, DRL is invoked to train a best response to a mixture of opponent policies.\nThe repeated application of DRL poses an expensive computational burden as we look to apply this algorithm to more complex domains.\nWe introduce two variations of PSRO designed to reduce the amount of simulation required during DRL training.\nBoth algorithms modify how PSRO adds new policies to the empirical game, based on learned responses to a single opponent policy.\nThe first, Mixed-Oracles, transfers knowledge from previous iterations of DRL, requiring training only against the opponent's newest policy.\nThe second, Mixed-Opponents, constructs a pure-strategy opponent by mixing existing strategy's action-value estimates, instead of their policies.\nLearning against a single policy mitigates conflicting experiences on behalf of a learner facing an unobserved distribution of opponents.\nWe empirically demonstrate that these algorithms substantially reduce the amount of simulation during training required by PSRO, while producing equivalent or better solutions to the game.", + "title": "Iterative Empirical Game 
Solving via Single Policy Best Response", + "authors": [ + "Max Smith", + "Thomas Anthony", + "Michael Wellman" + ], + "emails": [ + "~Thomas_Anthony1", + "~Michael_Wellman1", + "~Max_Smith1" + ], + "rank": 150 + }, + { + "url": "https://openreview.net/forum?id=h2EbJ4_wMVq", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Machine learning benefits from large training datasets, which may not always be possible to collect by any single entity, especially when using privacy-sensitive data. In many contexts, such as healthcare and finance, separate parties may wish to collaborate and learn from each other's data but are prevented from doing so due to privacy regulations. Some regulations prevent explicit sharing of data between parties by joining datasets in a central location (confidentiality). Others also limit implicit sharing of data, e.g., through model predictions (privacy). There is currently no method that enables machine learning in such a setting, where both confidentiality and privacy need to be preserved, to prevent both explicit and implicit sharing of data. Federated learning only provides confidentiality, not privacy, since gradients shared still contain private information. Differentially private learning assumes unreasonably large datasets. Furthermore, both of these learning paradigms produce a central model whose architecture was previously agreed upon by all parties rather than enabling collaborative learning where each party learns and improves their own local model. We introduce Confidential and Private Collaborative (CaPC) learning, the first method provably achieving both confidentiality and privacy in a collaborative setting. We leverage secure multi-party computation (MPC), homomorphic encryption (HE), and other techniques in combination with privately aggregated teacher models. 
We demonstrate how CaPC allows participants to collaborate without having to explicitly join their training sets or train a central model. Each party is able to improve the accuracy and fairness of their model, even in settings where each party has a model that performs well on their own dataset or when datasets are not IID and model architectures are heterogeneous across parties. ", + "title": "CaPC Learning: Confidential and Private Collaborative Learning", + "authors": [ + "Christopher A. Choquette-Choo", + "Natalie Dullerud", + "Adam Dziedzic", + "Yunxiang Zhang", + "Somesh Jha", + "Nicolas Papernot", + "Xiao Wang" + ], + "emails": [ + "~Yunxiang_Zhang1", + "~Natalie_Dullerud1", + "~Nicolas_Papernot1", + "~Somesh_Jha1", + "~Adam_Dziedzic1", + "~Christopher_A._Choquette-Choo1", + "~Xiao_Wang11" + ], + "rank": 151 + }, + { + "url": "https://openreview.net/forum?id=lqU2cs3Zca", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 8, + 7 + ], + "rating": "7.00", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "Signatory is a library for calculating and performing functionality related to the signature and logsignature transforms. The focus is on machine learning, and as such includes features such as CPU parallelism, GPU support, and backpropagation. To our knowledge it is the first GPU-capable library for these operations. Signatory implements new features not available in previous libraries, such as efficient precomputation strategies. Furthermore, several novel algorithmic improvements are introduced, producing substantial real-world speedups even on the CPU without parallelism. The library operates as a Python wrapper around C++, and is compatible with the PyTorch ecosystem. It may be installed directly via \\texttt{pip}. Source code, documentation, examples, benchmarks and tests may be found at \\texttt{\\url{https://github.com/patrick-kidger/signatory}}. 
The license is Apache-2.0.", + "title": "Signatory: differentiable computations of the signature and logsignature transforms, on both CPU and GPU", + "authors": [ + "Patrick Kidger", + "Terry Lyons" + ], + "emails": [ + "maths.ox.ac.uk", + "~Patrick_Kidger1" + ], + "rank": 152 + }, + { + "url": "https://openreview.net/forum?id=Fmg_fQYUejf", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 5, + 4, + 5 + ], + "abstract": "Continual (sequential) training and multitask (simultaneous) training are often attempting to solve the same overall objective: to find a solution that performs well on all considered tasks. The main difference is in the training regimes, where continual learning can only have access to one task at a time, which for neural networks typically leads to catastrophic forgetting. That is, the solution found for a subsequent task does not perform well on the previous ones anymore. \n However, the relationship between the different minima that the two training regimes arrive at is not well understood. What sets them apart? Is there a local structure that could explain the difference in performance achieved by the two different schemes? \n Motivated by recent work showing that different minima of the same task are typically connected by very simple curves of low error, we investigate whether multitask and continual solutions are similarly connected. We empirically find that indeed such connectivity can be reliably achieved and, more interestingly, it can be done by a linear path, conditioned on having the same initialization for both. We thoroughly analyze this observation and discuss its significance for the continual learning process.\n Furthermore, we exploit this finding to propose an effective algorithm that constrains the sequentially learned minima to behave as the multitask solution. 
We show that our method outperforms several state of the art continual learning algorithms on various vision benchmarks.", + "title": "Linear Mode Connectivity in Multitask and Continual Learning", + "authors": [ + "Seyed Iman Mirzadeh", + "Mehrdad Farajtabar", + "Dilan Gorur", + "Razvan Pascanu", + "Hassan Ghasemzadeh" + ], + "emails": [ + "~Razvan_Pascanu1", + "~Hassan_Ghasemzadeh1", + "~Dilan_Gorur1", + "~Mehrdad_Farajtabar1", + "wsu.edu" + ], + "rank": 153 + }, + { + "url": "https://openreview.net/forum?id=UcoXdfrORC", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 4, + 4, + 3, + 5 + ], + "abstract": "A generalist robot must be able to complete a variety of tasks in its environment. One appealing way to specify each task is in terms of a goal observation. However, learning goal-reaching policies with reinforcement learning remains a challenging problem, particularly when hand-engineered reward functions are not available. Learned dynamics models are a promising approach for learning about the environment without rewards or task-directed data, but planning to reach goals with such a model requires a notion of functional similarity between observations and goal states. We present a self-supervised method for model-based visual goal reaching, which uses both a visual dynamics model as well as a dynamical distance function learned using model-free reinforcement learning. Our approach learns entirely using offline, unlabeled data, making it practical to scale to large and diverse datasets. In our experiments, we find that our method can successfully learn models that perform a variety of tasks at test-time, moving objects amid distractors with a simulated robotic arm and even learning to open and close a drawer using a real-world robot. 
In comparisons, we find that this approach substantially outperforms both model-free and model-based prior methods.", + "title": "Model-Based Visual Planning with Self-Supervised Functional Distances", + "authors": [ + "Stephen Tian", + "Suraj Nair", + "Frederik Ebert", + "Sudeep Dasari", + "Benjamin Eysenbach", + "Chelsea Finn", + "Sergey Levine" + ], + "emails": [ + "~Frederik_Ebert1", + "~Chelsea_Finn1", + "~Stephen_Tian1", + "~Suraj_Nair1", + "~Benjamin_Eysenbach1", + "~Sudeep_Dasari2", + "~Sergey_Levine1" + ], + "rank": 154 + }, + { + "url": "https://openreview.net/forum?id=Oos98K9Lv-k", + "decision": "Accept (Spotlight)", + "ratings": [ + 6, + 8, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Recently, Neural Topic Models (NTMs) inspired by variational autoencoders have obtained increasingly research interest due to their promising results on text analysis. However, it is usually hard for existing NTMs to achieve good document representation and coherent/diverse topics at the same time. Moreover, they often degrade their performance severely on short documents. The requirement of reparameterisation could also comprise their training quality and model flexibility. To address these shortcomings, we present a new neural topic model via the theory of optimal transport (OT). Specifically, we propose to learn the topic distribution of a document by directly minimising its OT distance to the document's word distributions. Importantly, the cost matrix of the OT distance models the weights between topics and words, which is constructed by the distances between topics and words in an embedding space. Our proposed model can be trained efficiently with a differentiable loss. 
Extensive experiments show that our framework significantly outperforms the state-of-the-art NTMs on discovering more coherent and diverse topics and deriving better document representations for both regular and short texts.", + "title": "Neural Topic Model via Optimal Transport", + "authors": [ + "He Zhao", + "Dinh Phung", + "Viet Huynh", + "Trung Le", + "Wray Buntine" + ], + "emails": [ + "~Wray_Buntine1", + "~Dinh_Phung2", + "~He_Zhao1", + "~Viet_Huynh1", + "~Trung_Le2" + ], + "rank": 155 + }, + { + "url": "https://openreview.net/forum?id=MuSYkd1hxRP", + "decision": "Accept (Spotlight)", + "ratings": [ + 6, + 8, + 7 + ], + "rating": "7.00", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "Recent state-of-the-art methods for neural architecture search (NAS) exploit gradient-based optimization by relaxing the problem into continuous optimization over architectures and shared-weights, a noisy process that remains poorly understood. We argue for the study of single-level empirical risk minimization to understand NAS with weight-sharing, reducing the design of NAS methods to devising optimizers and regularizers that can quickly obtain high-quality solutions to this problem. Invoking the theory of mirror descent, we present a geometry-aware framework that exploits the underlying structure of this optimization to return sparse architectural parameters, leading to simple yet novel algorithms that enjoy fast convergence guarantees and achieve state-of-the-art accuracy on the latest NAS benchmarks in computer vision. Notably, we exceed the best published results for both CIFAR and ImageNet on both the DARTS search space and NAS-Bench-201; on the latter we achieve near-oracle-optimal performance on CIFAR-10 and CIFAR-100. 
Together, our theory and experiments demonstrate a principled way to co-design optimizers and continuous relaxations of discrete NAS search spaces.", + "title": "Geometry-Aware Gradient Algorithms for Neural Architecture Search", + "authors": [ + "Liam Li", + "Mikhail Khodak", + "Nina Balcan", + "Ameet Talwalkar" + ], + "emails": [ + "~Liam_Li1", + "~Ameet_Talwalkar1", + "~Mikhail_Khodak1", + "~Nina_Balcan1" + ], + "rank": 156 + }, + { + "url": "https://openreview.net/forum?id=N3zUDGN5lO", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 2, + 4, + 4, + 4 + ], + "abstract": "Multitask Reinforcement Learning is a promising way to obtain models with better performance, generalisation, data efficiency, and robustness. Most existing work is limited to compatible settings, where the state and action space dimensions are the same across tasks. Graph Neural Networks (GNN) are one way to address incompatible environments, because they can process graphs of arbitrary size. They also allow practitioners to inject biases encoded in the structure of the input graph. Existing work in graph-based continuous control uses the physical morphology of the agent to construct the input graph, i.e., encoding limb features as node labels and using edges to connect the nodes if their corresponded limbs are physically connected.\nIn this work, we present a series of ablations on existing methods that show that morphological information encoded in the graph does not improve their performance. Motivated by the hypothesis that any benefits GNNs extract from the graph structure are outweighed by difficulties they create for message passing, we also propose Amorpheus, a transformer-based approach. 
Further results show that, while Amorpheus ignores the morphological information that GNNs encode, it nonetheless substantially outperforms GNN-based methods.", + "title": "My Body is a Cage: the Role of Morphology in Graph-Based Incompatible Control", + "authors": [ + "Vitaly Kurin", + "Maximilian Igl", + "Tim Rockt\u00e4schel", + "Wendelin Boehmer", + "Shimon Whiteson" + ], + "emails": [ + "~Shimon_Whiteson1", + "~Maximilian_Igl1", + "~Wendelin_Boehmer1", + "~Tim_Rockt\u00e4schel1", + "~Vitaly_Kurin1" + ], + "rank": 157 + }, + { + "url": "https://openreview.net/forum?id=eNdiU_DbM9", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Convolutional image classifiers can achieve high predictive accuracy, but quanti\u0002fying their uncertainty remains an unresolved challenge, hindering their deployment in consequential settings. Existing uncertainty quantification techniques, such as Platt scaling, attempt to calibrate the network\u2019s probability estimates, but they do not have formal guarantees. We present an algorithm that modifies any classifier to output a predictive set containing the true label with a user-specified probability, such as 90%. The algorithm is simple and fast like Platt scaling, but provides a formal finite-sample coverage guarantee for every model and dataset. Our method modifies an existing conformal prediction algorithm to give more sta\u0002ble predictive sets by regularizing the small scores of unlikely classes after Platt scaling. 
In experiments on both Imagenet and Imagenet-V2 with ResNet-152 and other classifiers, our scheme outperforms existing approaches, achieving coverage with sets that are often factors of 5 to 10 smaller than a stand-alone Platt scaling baseline.", + "title": "Uncertainty Sets for Image Classifiers using Conformal Prediction", + "authors": [ + "Anastasios Nikolas Angelopoulos", + "Stephen Bates", + "Michael Jordan", + "Jitendra Malik" + ], + "emails": [ + "~Michael_Jordan1", + "~Anastasios_Nikolas_Angelopoulos1", + "~Jitendra_Malik2", + "eecs.berkeley.edu" + ], + "rank": 158 + }, + { + "url": "https://openreview.net/forum?id=Ec85b0tUwbA", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 7, + 6, + 7 + ], + "rating": "7.00", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Hyperbolic spaces, which have the capacity to embed tree structures without distortion owing to their exponential volume growth, have recently been applied to machine learning to better capture the hierarchical nature of data. In this study, we generalize the fundamental components of neural networks in a single hyperbolic geometry model, namely, the Poincar\u00e9 ball model. This novel methodology constructs a multinomial logistic regression, fully-connected layers, convolutional layers, and attention mechanisms under a unified mathematical interpretation, without increasing the parameters. 
Experiments show the superior parameter efficiency of our methods compared to conventional hyperbolic components, and stability and outperformance over their Euclidean counterparts.", + "title": "Hyperbolic Neural Networks++", + "authors": [ + "Ryohei Shimizu", + "YUSUKE Mukuta", + "Tatsuya Harada" + ], + "emails": [ + "~Ryohei_Shimizu1", + "~Tatsuya_Harada1", + "~YUSUKE_Mukuta1" + ], + "rank": 159 + }, + { + "url": "https://openreview.net/forum?id=YtMG5ex0ou", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "We propose a new probabilistic method for unsupervised recovery of corrupted data. Given a large ensemble of degraded samples, our method recovers accurate posteriors of clean values, allowing the exploration of the manifold of possible reconstructed data and hence characterising the underlying uncertainty. In this set-ting, direct application of classical variational methods often gives rise to collapsed densities that do not adequately explore the solution space. Instead, we derive our novel reduced entropy condition approximate inference method that results in rich posteriors. We test our model in a data recovery task under the common setting of missing values and noise, demonstrating superior performance to existing variational methods for imputation and de-noising with different real data sets. 
We further show higher classification accuracy after imputation, proving the advantage of propagating uncertainty to downstream tasks with our model.", + "title": "Tomographic Auto-Encoder: Unsupervised Bayesian Recovery of Corrupted Data", + "authors": [ + "Francesco Tonolini", + "Pablo Garcia Moreno", + "Andreas Damianou", + "Roderick Murray-Smith" + ], + "emails": [ + "~Pablo_Garcia_Moreno1", + "~Andreas_Damianou1", + "~Francesco_Tonolini1", + "~Roderick_Murray-Smith1" + ], + "rank": 160 + }, + { + "url": "https://openreview.net/forum?id=MLSvqIHRidA", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 7, + 7, + 6 + ], + "rating": "7.00", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Contrastive divergence (CD) learning is a classical method for fitting unnormalized statistical models to data samples. Despite its wide-spread use, the convergence properties of this algorithm are still not well understood. The main source of difficulty is an unjustified approximation which has been used to derive the gradient of the loss. In this paper, we present an alternative derivation of CD that does not require any approximation and sheds new light on the objective that is actually being optimized by the algorithm. Specifically, we show that CD is an adversarial learning procedure, where a discriminator attempts to classify whether a Markov chain generated from the model has been time-reversed. Thus, although predating generative adversarial networks (GANs) by more than a decade, CD is, in fact, closely related to these techniques. Our derivation settles well with previous observations, which have concluded that CD's update steps cannot be expressed as the gradients of any fixed objective function. 
In addition, as a byproduct, our derivation reveals a simple correction that can be used as an alternative to Metropolis-Hastings rejection, which is required when the underlying Markov chain is inexact (e.g., when using Langevin dynamics with a large step).", + "title": "Contrastive Divergence Learning is a Time Reversal Adversarial Game", + "authors": [ + "Omer Yair", + "Tomer Michaeli" + ], + "emails": [ + "~Omer_Yair1", + "~Tomer_Michaeli1" + ], + "rank": 161 + }, + { + "url": "https://openreview.net/forum?id=hvdKKV2yt7T", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "With increasingly more data and computation involved in their training, machine learning models constitute valuable intellectual property. This has spurred interest in model stealing, which is made more practical by advances in learning with partial, little, or no supervision. Existing defenses focus on inserting unique watermarks in a model's decision surface, but this is insufficient: the watermarks are not sampled from the training distribution and thus are not always preserved during model stealing. In this paper, we make the key observation that knowledge contained in the stolen model's training set is what is common to all stolen copies. The adversary's goal, irrespective of the attack employed, is always to extract this knowledge or its by-products. This gives the original model's owner a strong advantage over the adversary: model owners have access to the original training data. We thus introduce dataset inference, the process of identifying whether a suspected model copy has private knowledge from the original model's dataset, as a defense against model stealing. We develop an approach for dataset inference that combines statistical testing with the ability to estimate the distance of multiple data points to the decision boundary. 
Our experiments on CIFAR10, SVHN, CIFAR100 and ImageNet show that model owners can claim with confidence greater than 99% that their model (or dataset as a matter of fact) was stolen, despite only exposing 50 of the stolen model's training points. Dataset inference defends against state-of-the-art attacks even when the adversary is adaptive. Unlike prior work, it does not require retraining or overfitting the defended model.", + "title": "Dataset Inference: Ownership Resolution in Machine Learning", + "authors": [ + "Pratyush Maini", + "Mohammad Yaghini", + "Nicolas Papernot" + ], + "emails": [ + "~Mohammad_Yaghini1", + "~Pratyush_Maini1", + "~Nicolas_Papernot1" + ], + "rank": 162 + }, + { + "url": "https://openreview.net/forum?id=JBAa9we1AL", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 2, + 4, + 4 + ], + "abstract": "We consider the task of enforcing individual fairness in gradient boosting. Gradient boosting is a popular method for machine learning from tabular data, which arise often in applications where algorithmic fairness is a concern. At a high level, our approach is a functional gradient descent on a (distributionally) robust loss function that encodes our intuition of algorithmic fairness for the ML task at hand. Unlike prior approaches to individual fairness that only work with smooth ML models, our approach also works with non-smooth models such as decision trees. We show that our algorithm converges globally and generalizes. 
We also demonstrate the efficacy of our algorithm on three ML problems susceptible to algorithmic bias.", + "title": "Individually Fair Gradient Boosting", + "authors": [ + "Alexander Vargo", + "Fan Zhang", + "Mikhail Yurochkin", + "Yuekai Sun" + ], + "emails": [ + "~Yuekai_Sun1", + "umich.edu", + "~Mikhail_Yurochkin1", + "shanghaitech.edu.cn" + ], + "rank": 163 + }, + { + "url": "https://openreview.net/forum?id=YmqAnY0CMEy", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "We demonstrate that self-supervised language modeling applied to mathematical formulas enables logical reasoning. To measure the logical reasoning abilities of language models, we formulate several evaluation (downstream) tasks, such as inferring types, suggesting missing assumptions and completing equalities. For training language models for formal mathematics, we propose a novel skip-tree task. We find that models trained on the skip-tree task show surprisingly strong mathematical reasoning abilities, and outperform models trained on standard skip-sequence tasks. We also analyze the models' ability to formulate new conjectures by measuring how often the predictions are provable and useful in other proofs.", + "title": "Mathematical Reasoning via Self-supervised Skip-tree Training", + "authors": [ + "Markus Norman Rabe", + "Dennis Lee", + "Kshitij Bansal", + "Christian Szegedy" + ], + "emails": [ + "~Dennis_Lee1", + "~Markus_Norman_Rabe1", + "~Christian_Szegedy1", + "~Kshitij_Bansal1" + ], + "rank": 164 + }, + { + "url": "https://openreview.net/forum?id=xYGNO86OWDH", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "The geometric properties of contextual embedding spaces for deep language models such as BERT and ERNIE, have attracted considerable attention in recent years. 
Investigations on the contextual embeddings demonstrate a strong anisotropic space such that most of the vectors fall within a narrow cone, leading to high cosine similarities. It is surprising that these LMs are as successful as they are, given that most of their embedding vectors are as similar to one another as they are. In this paper, we argue that the isotropy indeed exists in the space, from a different but more constructive perspective. We identify isolated clusters and low dimensional manifolds in the contextual embedding space, and introduce tools to both qualitatively and quantitatively analyze them. We hope the study in this paper could provide insights towards a better understanding of the deep language models.", + "title": "Isotropy in the Contextual Embedding Space: Clusters and Manifolds", + "authors": [ + "Xingyu Cai", + "Jiaji Huang", + "Yuchen Bian", + "Kenneth Church" + ], + "emails": [ + "~Xingyu_Cai1", + "~Jiaji_Huang1", + "~Yuchen_Bian1", + "~Kenneth_Church1" + ], + "rank": 165 + }, + { + "url": "https://openreview.net/forum?id=YicbFdNTTy", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 5, + 4, + 4, + 3 + ], + "abstract": "While the Transformer architecture has become the de-facto standard for natural language processing tasks, its applications to computer vision remain limited. In vision, attention is either applied in conjunction with convolutional networks, or used to replace certain components of convolutional networks while keeping their overall structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to sequences of image patches can perform very well on image classification tasks. 
When pre-trained on large amounts of data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.), Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring substantially fewer computational resources to train.", + "title": "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale", + "authors": [ + "Alexey Dosovitskiy", + "Lucas Beyer", + "Alexander Kolesnikov", + "Dirk Weissenborn", + "Xiaohua Zhai", + "Thomas Unterthiner", + "Mostafa Dehghani", + "Matthias Minderer", + "Georg Heigold", + "Sylvain Gelly", + "Jakob Uszkoreit", + "Neil Houlsby" + ], + "emails": [ + "~Thomas_Unterthiner1", + "~Xiaohua_Zhai2", + "~Mostafa_Dehghani1", + "~Lucas_Beyer1", + "~Jakob_Uszkoreit1", + "~Alexander_Kolesnikov2", + "~Georg_Heigold1", + "~Matthias_Minderer1", + "~Neil_Houlsby1", + "~Sylvain_Gelly1", + "~Alexey_Dosovitskiy1", + "~Dirk_Weissenborn1" + ], + "rank": 166 + }, + { + "url": "https://openreview.net/forum?id=JWOiYxMG92s", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 5, + 4, + 4 + ], + "abstract": "Learning from a limited number of samples is challenging since the learned model can easily become overfitted based on the biased distribution formed by only a few training examples. In this paper, we calibrate the distribution of these few-sample classes by transferring statistics from the classes with sufficient examples. Then an adequate number of examples can be sampled from the calibrated distribution to expand the inputs to the classifier. We assume every dimension in the feature representation follows a Gaussian distribution so that the mean and the variance of the distribution can borrow from that of similar classes whose statistics are better estimated with an adequate number of samples. 
Our method can be built on top of off-the-shelf pretrained feature extractors and classification models without extra parameters. We show that a simple logistic regression classifier trained using the features sampled from our calibrated distribution can outperform the state-of-the-art accuracy on three datasets (~5% improvement on miniImageNet compared to the next best). The visualization of these generated features demonstrates that our calibrated distribution is an accurate estimation. ", + "title": "Free Lunch for Few-shot Learning: Distribution Calibration", + "authors": [ + "Shuo Yang", + "Lu Liu", + "Min Xu" + ], + "emails": [ + "~Shuo_Yang5", + "~Lu_Liu7", + "~Min_Xu5" + ], + "rank": 167 + }, + { + "url": "https://openreview.net/forum?id=bB2drc7DPuB", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 3, + 3, + 4, + 3 + ], + "abstract": "We study the problem of policy optimization for infinite-horizon discounted Markov Decision Processes with softmax policy and nonlinear function approximation trained with policy gradient algorithms. We concentrate on the training dynamics in the mean-field regime, modeling e.g. the behavior of wide single hidden layer neural networks, when exploration is encouraged through entropy regularization. The dynamics of these models is established as a Wasserstein gradient flow of distributions in parameter space. 
We further prove global optimality of the fixed points of this dynamics under mild conditions on their initialization.", + "title": "Global optimality of softmax policy gradient with single hidden layer neural networks in the mean-field regime", + "authors": [ + "Andrea Agazzi", + "Jianfeng Lu" + ], + "emails": [ + "~Andrea_Agazzi2", + "~Jianfeng_Lu1" + ], + "rank": 168 + }, + { + "url": "https://openreview.net/forum?id=GY6-6sTvGaf", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 5, + 5, + 3, + 4 + ], + "abstract": "We propose a simple data augmentation technique that can be applied to standard model-free reinforcement learning algorithms, enabling robust learning directly from pixels without the need for auxiliary losses or pre-training. The approach leverages input perturbations commonly used in computer vision tasks to transform input examples, as well as regularizing the value function and policy. Existing model-free approaches, such as Soft Actor-Critic (SAC), are not able to train deep networks effectively from image pixels. However, the addition of our augmentation method dramatically improves SAC\u2019s performance, enabling it to reach state-of-the-art performance on the DeepMind control suite, surpassing model-based (Hafner et al., 2019; Lee et al., 2019; Hafner et al., 2018) methods and recently proposed contrastive learning (Srinivas et al., 2020). Our approach, which we dub DrQ: Data-regularized Q, can be combined with any model-free reinforcement learning algorithm. 
We further demonstrate this by applying it to DQN and significantly improve its data-efficiency on the Atari 100k benchmark.", + "title": "Image Augmentation Is All You Need: Regularizing Deep Reinforcement Learning from Pixels", + "authors": [ + "Denis Yarats", + "Ilya Kostrikov", + "Rob Fergus" + ], + "emails": [ + "~Denis_Yarats1", + "~Ilya_Kostrikov1", + "~Rob_Fergus1" + ], + "rank": 169 + }, + { + "url": "https://openreview.net/forum?id=84gjULz1t5", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Communication compression has become a key strategy to speed up distributed optimization. However, existing decentralized algorithms with compression mainly focus on compressing DGD-type algorithms. They are unsatisfactory in terms of convergence rate, stability, and the capability to handle heterogeneous data. Motivated by primal-dual algorithms, this paper proposes the first \\underline{L}in\\underline{EA}r convergent \\underline{D}ecentralized algorithm with compression, LEAD. Our theory describes the coupled dynamics of the inexact primal and dual update as well as compression error, and we provide the first consensus error bound in such settings without assuming bounded gradients. 
Experiments on convex problems validate our theoretical analysis, and empirical study on deep neural nets shows that LEAD is applicable to non-convex problems.", + "title": "Linear Convergent Decentralized Optimization with Compression", + "authors": [ + "Xiaorui Liu", + "Yao Li", + "Rongrong Wang", + "Jiliang Tang", + "Ming Yan" + ], + "emails": [ + "~Xiaorui_Liu1", + "~Jiliang_Tang1", + "msu.edu" + ], + "rank": 170 + }, + { + "url": "https://openreview.net/forum?id=umIdUL8rMH", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 4, + 4, + 5 + ], + "abstract": "Model Agnostic Meta-Learning (MAML) is one of the most representative of gradient-based meta-learning algorithms. MAML learns new tasks with a few data samples using inner updates from a meta-initialization point and learns the meta-initialization parameters with outer updates. It has recently been hypothesized that representation reuse, which makes little change in efficient representations, is the dominant factor in the performance of the meta-initialized model through MAML in contrast to representation change, which causes a significant change in representations. In this study, we investigate the necessity of representation change for the ultimate goal of few-shot learning, which is solving domain-agnostic tasks. To this aim, we propose a novel meta-learning algorithm, called BOIL (Body Only update in Inner Loop), which updates only the body (extractor) of the model and freezes the head (classifier) during inner loop updates. BOIL leverages representation change rather than representation reuse. A frozen head cannot achieve better results than even a random guessing classifier at the initial point of new tasks, and feature vectors (representations) have to move quickly to their corresponding frozen head vectors. We visualize this property using cosine similarity, CKA, and empirical results without the head. 
Although the inner loop updates purely hinge on representation change, BOIL empirically shows significant performance improvement over MAML, particularly on cross-domain tasks. The results imply that representation change in gradient-based meta-learning approaches is a critical component.", + "title": "BOIL: Towards Representation Change for Few-shot Learning", + "authors": [ + "Jaehoon Oh", + "Hyungjun Yoo", + "ChangHwan Kim", + "Se-Young Yun" + ], + "emails": [ + "~Se-Young_Yun1", + "~Hyungjun_Yoo1", + "~Jaehoon_Oh1", + "~ChangHwan_Kim2" + ], + "rank": 171 + }, + { + "url": "https://openreview.net/forum?id=mNtmhaDkAr", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 8 + ], + "rating": "7.00", + "confidences": [ + 4, + 2, + 4 + ], + "abstract": "Most current NLP systems are based on a pre-train-then-fine-tune paradigm, in which a large neural network is first trained in a self-supervised way designed to encourage the network to extract broadly-useful linguistic features, and then fine-tuned for a specific task of interest. Recent work attempts to understand why this recipe works and explain when it fails. Currently, such analyses have produced two sets of apparently-contradictory results. Work that analyzes the representations that result from pre-training (via \"probing classifiers\") finds evidence that rich features of linguistic structure can be decoded with high accuracy, but work that analyzes model behavior after fine-tuning (via \"challenge sets\") indicates that decisions are often not based on such structure but rather on spurious heuristics specific to the training set. In this work, we test the hypothesis that the extent to which a feature influences a model's decisions can be predicted using a combination of two factors: The feature's \"extractability\" after pre-training (measured using information-theoretic probing techniques), and the \"evidence\" available during fine-tuning (defined as the feature's co-occurrence rate with the label). 
In experiments with both synthetic and natural language data, we find strong evidence (statistically significant correlations) supporting this hypothesis.", + "title": "Predicting Inductive Biases of Pre-Trained Models", + "authors": [ + "Charles Lovering", + "Rohan Jha", + "Tal Linzen", + "Ellie Pavlick" + ], + "emails": [ + "brown.edu", + "~Tal_Linzen1", + "~Charles_Lovering1", + "~Ellie_Pavlick1" + ], + "rank": 172 + }, + { + "url": "https://openreview.net/forum?id=eQe8DEWNN2W", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 8, + 5, + 7 + ], + "rating": "7.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Calibrating neural networks is of utmost importance when employing them in safety-critical applications where the downstream decision making depends on the predicted probabilities. Measuring calibration error amounts to comparing two empirical distributions. In this work, we introduce a binning-free calibration measure inspired by the classical Kolmogorov-Smirnov (KS) statistical test in which the main idea is to compare the respective cumulative probability distributions. From this, by approximating the empirical cumulative distribution using a differentiable function via splines, we obtain a recalibration function, which maps the network outputs to actual (calibrated) class assignment probabilities. The spline-fitting is performed using a held-out calibration set and the obtained recalibration function is evaluated on an unseen test set. We tested our method against existing calibration approaches on various image classification datasets and our spline-based recalibration approach consistently outperforms existing methods on KS error as well as other commonly used calibration measures. 
Code is available online at https://github.com/kartikgupta-at-anu/spline-calibration.", + "title": "Calibration of Neural Networks using Splines", + "authors": [ + "Kartik Gupta", + "Amir Rahimi", + "Thalaiyasingam Ajanthan", + "Thomas Mensink", + "Cristian Sminchisescu", + "Richard Hartley" + ], + "emails": [ + "~Kartik_Gupta2", + "~Amir_Rahimi1", + "~Thalaiyasingam_Ajanthan1", + "~Thomas_Mensink1", + "~Cristian_Sminchisescu1", + "~Richard_Hartley1" + ], + "rank": 173 + }, + { + "url": "https://openreview.net/forum?id=Pzj6fzU6wkj", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 9, + 7, + 6 + ], + "rating": "7.00", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "A well-defined benchmark is essential for measuring and accelerating research progress of machine learning models. In this paper, we present a benchmark for high-level mathematical reasoning and study the reasoning capabilities of neural sequence-to-sequence models. We build a non-synthetic dataset from the largest repository of proofs written by human experts in a theorem prover. The dataset has a broad coverage of undergraduate and research-level mathematical and computer science theorems. In our defined task, a model is required to fill in a missing intermediate proposition given surrounding proofs. This task provides a starting point for the long-term goal of having machines generate human-readable proofs automatically. Our experiments and analysis reveal that while the task is challenging, neural models can capture non-trivial mathematical reasoning. We further design a hierarchical transformer that outperforms the transformer baseline. ", + "title": "IsarStep: a Benchmark for High-level Mathematical Reasoning", + "authors": [ + "Wenda Li", + "Lei Yu", + "Yuhuai Wu", + "Lawrence C. 
Paulson" + ], + "emails": [ + "cam.ac.uk", + "~Lei_Yu4", + "~Wenda_Li1", + "~Yuhuai_Wu1" + ], + "rank": 174 + }, + { + "url": "https://openreview.net/forum?id=ks5nebunVn_", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "Robustness against word substitutions has a well-defined and widely acceptable form, i.e., using semantically similar words as substitutions, and thus it is considered as a fundamental stepping-stone towards broader robustness in natural language processing. Previous defense methods capture word substitutions in vector space by using either l_2-ball or hyper-rectangle, which results in perturbation sets that are not inclusive enough or unnecessarily large, and thus impedes mimicry of worst cases for robust training. In this paper, we introduce a novel Adversarial Sparse Convex Combination (ASCC) method. We model the word substitution attack space as a convex hull and leverages a regularization term to enforce perturbation towards an actual substitution, thus aligning our modeling better with the discrete textual space. Based on ASCC method, we further propose ASCC-defense, which leverages ASCC to generate worst-case perturbations and incorporates adversarial training towards robustness. Experiments show that ASCC-defense outperforms the current state-of-the-arts in terms of robustness on two prevailing NLP tasks, i.e., sentiment analysis and natural language inference, concerning several attacks across multiple model architectures. 
Besides, we also envision a new class of defense towards robustness in NLP, where our robustly trained word vectors can be plugged into a normally trained model and enforce its robustness without applying any other defense techniques.", + "title": "Towards Robustness Against Natural Language Word Substitutions", + "authors": [ + "Xinshuai Dong", + "Anh Tuan Luu", + "Rongrong Ji", + "Hong Liu" + ], + "emails": [ + "~Xinshuai_Dong1", + "~Anh_Tuan_Luu2", + "~Rongrong_Ji5", + "~Hong_Liu9" + ], + "rank": 175 + }, + { + "url": "https://openreview.net/forum?id=_WnwtieRHxM", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 4, + 1, + 4, + 4 + ], + "abstract": "The recent paper by Byrd & Lipton (2019), based on empirical observations, raises a major concern on the impact of importance weighting for the over-parameterized deep learning models. They observe that as long as the model can separate the training data, the impact of importance weighting diminishes as the training proceeds. Nevertheless, there lacks a rigorous characterization of this phenomenon. In this paper, we provide formal characterizations and theoretical justifications on the role of importance weighting with respect to the implicit bias of gradient descent and margin-based learning theory. We reveal both the optimization dynamics and generalization performance under deep learning models. 
Our work not only explains the various novel phenomenons observed for importance weighting in deep learning, but also extends to the studies where the weights are being optimized as part of the model, which applies to a number of topics under active research.", + "title": "Understanding the role of importance weighting for deep learning", + "authors": [ + "Da Xu", + "Yuting Ye", + "Chuanwei Ruan" + ], + "emails": [ + "~Chuanwei_Ruan1", + "~Da_Xu2", + "berkeley.edu" + ], + "rank": 176 + }, + { + "url": "https://openreview.net/forum?id=c_E8kFWfhp0", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "In this paper, we tackle the problem of estimating object physical properties such as mass, friction, and elasticity directly from video sequences. Such a system identification problem is fundamentally ill-posed due to the loss of information during image formation. Current best solutions to the problem require precise 3D labels which are labor intensive to gather, and infeasible to create for many systems such as deformable solids or cloth. In this work we present gradSim, a framework that overcomes the dependence on 3D supervision by combining differentiable multiphysics simulation and differentiable rendering to jointly model the evolution of scene dynamics and image formation. This unique combination enables backpropagation from pixels in a video sequence through to the underlying physical attributes that generated them. Furthermore, our unified computation graph across dynamics and rendering engines enables the learning of challenging visuomotor control tasks, without relying on state-based (3D) supervision, while obtaining performance competitive to/better than techniques that require precise 3D labels.", + "title": "gradSim: Differentiable simulation for system identification and visuomotor control", + "authors": [ + "J. 
Krishna Murthy", + "Miles Macklin", + "Florian Golemo", + "Vikram Voleti", + "Linda Petrini", + "Martin Weiss", + "Breandan Considine", + "J\u00e9r\u00f4me Parent-L\u00e9vesque", + "Kevin Xie", + "Kenny Erleben", + "Liam Paull", + "Florian Shkurti", + "Derek Nowrouzezahrai", + "Sanja Fidler" + ], + "emails": [ + "~Miles_Macklin1", + "~Florian_Golemo1", + "~Derek_Nowrouzezahrai1", + "~Martin_Weiss4", + "~J._Krishna_Murthy1", + "~Vikram_Voleti1", + "~Linda_Petrini1", + "~Breandan_Considine2", + "~Liam_Paull1", + "~J\u00e9r\u00f4me_Parent-L\u00e9vesque2", + "~Sanja_Fidler1", + "~Florian_Shkurti1", + "di.ku.dk", + "cs.toronto.edu" + ], + "rank": 177 + }, + { + "url": "https://openreview.net/forum?id=0zvfm-nZqQs", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Knowledge Distillation (KD) is a widely used technique to transfer knowledge from pre-trained teacher models to (usually more lightweight) student models. However, in certain situations, this technique is more of a curse than a blessing. For instance, KD poses a potential risk of exposing intellectual properties (IPs): even if a trained machine learning model is released in ``black boxes'' (e.g., as executable software or APIs without open-sourcing code), it can still be replicated by KD through imitating input-output behaviors. To prevent this unwanted effect of KD, this paper introduces and investigates a concept called Nasty Teacher: a specially trained teacher network that yields nearly the same performance as a normal one, but would significantly degrade the performance of student models learned by imitating it. We propose a simple yet effective algorithm to build the nasty teacher, called self-undermining knowledge distillation. Specifically, we aim to maximize the difference between the output of the nasty teacher and a normal pre-trained network. 
Extensive experiments on several datasets demonstrate that our method is effective on both standard KD and data-free KD, providing the desirable KD-immunity to model owners for the first time. We hope our preliminary study can draw more awareness and interest in this new practical problem of both social and legal importance. Our codes and pre-trained models can be found at: $\\url{https://github.com/VITA-Group/Nasty-Teacher}$.", + "title": "Undistillable: Making A Nasty Teacher That CANNOT teach students", + "authors": [ + "Haoyu Ma", + "Tianlong Chen", + "Ting-Kuei Hu", + "Chenyu You", + "Xiaohui Xie", + "Zhangyang Wang" + ], + "emails": [ + "~Ting-Kuei_Hu1", + "~Haoyu_Ma1", + "~Chenyu_You1", + "~Zhangyang_Wang1", + "~Tianlong_Chen1", + "~Xiaohui_Xie2" + ], + "rank": 178 + }, + { + "url": "https://openreview.net/forum?id=o_V-MjyyGV_", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "In most real world scenarios, a policy trained by reinforcement learning in one environment needs to be deployed in another, potentially quite different environment. However, generalization across different environments is known to be hard. A natural solution would be to keep training after deployment in the new environment, but this cannot be done if the new environment offers no reward signal. Our work explores the use of self-supervision to allow the policy to continue training after deployment without using any rewards. While previous methods explicitly anticipate changes in the new environment, we assume no prior knowledge of those changes yet still obtain significant improvements. Empirical evaluations are performed on diverse simulation environments from DeepMind Control suite and ViZDoom, as well as real robotic manipulation tasks in continuously changing environments, taking observations from an uncalibrated camera. 
Our method improves generalization in 31 out of 36 environments across various tasks and outperforms domain randomization on a majority of environments. Webpage and implementation: https://nicklashansen.github.io/PAD/.", + "title": "Self-Supervised Policy Adaptation during Deployment", + "authors": [ + "Nicklas Hansen", + "Rishabh Jangir", + "Yu Sun", + "Guillem Aleny\u00e0", + "Pieter Abbeel", + "Alexei A Efros", + "Lerrel Pinto", + "Xiaolong Wang" + ], + "emails": [ + "~Alexei_A_Efros1", + "~Lerrel_Pinto1", + "~Yu_Sun1", + "~Nicklas_Hansen1", + "~Guillem_Aleny\u00e01", + "gmail.com", + "~Pieter_Abbeel2", + "~Xiaolong_Wang3" + ], + "rank": 179 + }, + { + "url": "https://openreview.net/forum?id=zv-typ1gPxA", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 3, + 3, + 3 + ], + "abstract": "Source code summarization aims to generate natural language summaries from structured code snippets for better understanding code functionalities. However, automatic code summarization is challenging due to the complexity of the source code and the language gap between the source code and natural language summaries. Most previous approaches either rely on retrieval-based (which can take advantage of similar examples seen from the retrieval database, but have low generalization performance) or generation-based methods (which have better generalization performance, but cannot take advantage of similar examples).\nThis paper proposes a novel retrieval-augmented mechanism to combine the benefits of both worlds.\nFurthermore, to mitigate the limitation of Graph Neural Networks (GNNs) on capturing global graph structure information of source code, we propose a novel attention-based dynamic graph to complement the static graph representation of the source code, and design a hybrid message passing GNN for capturing both the local and global structural information. 
To evaluate the proposed approach, we release a new challenging benchmark, crawled from diversified large-scale open-source C projects (total 95k+ unique functions in the dataset). Our method achieves the state-of-the-art performance, improving existing methods by 1.42, 2.44 and 1.29 in terms of BLEU-4, ROUGE-L and METEOR.", + "title": "Retrieval-Augmented Generation for Code Summarization via Hybrid GNN", + "authors": [ + "Shangqing Liu", + "Yu Chen", + "Xiaofei Xie", + "Jing Kai Siow", + "Yang Liu" + ], + "emails": [ + "~Yu_Chen5", + "~Xiaofei_Xie1", + "e.ntu.edu.sg", + "~Shangqing_Liu1", + "~Yang_Liu36" + ], + "rank": 180 + }, + { + "url": "https://openreview.net/forum?id=DktZb97_Fx", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 3, + 2, + 4, + 3 + ], + "abstract": "In this paper, we cast fair machine learning as invariant machine learning. We first formulate a version of individual fairness that enforces invariance on certain sensitive sets. We then design a transport-based regularizer that enforces this version of individual fairness and develop an algorithm to minimize the regularizer efficiently. Our theoretical results guarantee the proposed approach trains certifiably fair ML models. 
Finally, in the experimental studies we demonstrate improved fairness metrics in comparison to several recent fair training procedures on three ML tasks that are susceptible to algorithmic bias.", + "title": "SenSeI: Sensitive Set Invariance for Enforcing Individual Fairness", + "authors": [ + "Mikhail Yurochkin", + "Yuekai Sun" + ], + "emails": [ + "~Mikhail_Yurochkin1", + "~Yuekai_Sun1" + ], + "rank": 181 + }, + { + "url": "https://openreview.net/forum?id=V8jrrnwGbuc", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Understanding how large neural networks avoid memorizing training data is key to explaining their high generalization performance. To examine the structure of when and where memorization occurs in a deep network, we use a recently developed replica-based mean field theoretic geometric analysis method. We find that all layers preferentially learn from examples which share features, and link this behavior to generalization performance. Memorization predominately occurs in the deeper layers, due to decreasing object manifolds\u2019 radius and dimension, whereas early layers are minimally affected. This predicts that generalization can be restored by reverting the final few layer weights to earlier epochs before significant memorization occurred, which is confirmed by the experiments. Additionally, by studying generalization under different model sizes, we reveal the connection between the double descent phenomenon and the underlying model geometry. Finally, analytical analysis shows that networks avoid memorization early in training because close to initialization, the gradient contribution from permuted examples are small. 
These findings provide quantitative evidence for the structure of memorization across layers of a deep neural network, the drivers for such structure, and its connection to manifold geometric properties.\n", + "title": "On the geometry of generalization and memorization in deep neural networks", + "authors": [ + "Cory Stephenson", + "suchismita padhy", + "Abhinav Ganesh", + "Yue Hui", + "Hanlin Tang", + "SueYeon Chung" + ], + "emails": [ + "stanford.edu", + "~suchismita_padhy2", + "~Hanlin_Tang1", + "~SueYeon_Chung1", + "~Cory_Stephenson1", + "intel.com" + ], + "rank": 182 + }, + { + "url": "https://openreview.net/forum?id=_XYzwxPIQu6", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 7, + 6, + 7 + ], + "rating": "7.00", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "A main theoretical interest in biology and physics is to identify the nonlinear dynamical system (DS) that generated observed time series. Recurrent Neural Networks (RNN) are, in principle, powerful enough to approximate any underlying DS, but in their vanilla form suffer from the exploding vs. vanishing gradients problem. Previous attempts to alleviate this problem resulted either in more complicated, mathematically less tractable RNN architectures, or strongly limited the dynamical expressiveness of the RNN. \nHere we address this issue by suggesting a simple regularization scheme for vanilla RNN with ReLU activation which enables them to solve long-range dependency problems and express slow time scales, while retaining a simple mathematical structure which makes their DS properties partly analytically accessible. 
We prove two theorems that establish a tight connection between the regularized RNN dynamics and their gradients, illustrate on DS benchmarks that our regularization approach strongly eases the reconstruction of DS which harbor widely differing time scales, and show that our method is also en par with other long-range architectures like LSTMs on several tasks.", + "title": "Identifying nonlinear dynamical systems with multiple time scales and long-range dependencies", + "authors": [ + "Dominik Schmidt", + "Georgia Koppe", + "Zahra Monfared", + "Max Beutelspacher", + "Daniel Durstewitz" + ], + "emails": [ + "~Daniel_Durstewitz1", + "~Georgia_Koppe1", + "zi-mannheim.de", + "~Dominik_Schmidt1", + "mailbox.org" + ], + "rank": 183 + }, + { + "url": "https://openreview.net/forum?id=EoFNy62JGd", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 6, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "While training can mostly be accelerated by reducing the time needed to propagate neural gradients (loss gradients with respect to the intermediate neural layer outputs) back throughout the model, most previous works focus on the quantization/pruning of weights and activations. These methods are often not applicable to neural gradients, which have very different statistical properties. Distinguished from weights and activations, we find that the distribution of neural gradients is approximately lognormal. Considering this, we suggest two closed-form analytical methods to reduce the computational and memory burdens of neural gradients. The first method optimizes the floating-point format and scale of the gradients. The second method accurately sets sparsity thresholds for gradient pruning. Each method achieves state-of-the-art results on ImageNet. 
To the best of our knowledge, this paper is the first to (1) quantize the gradients to 6-bit floating-point formats, or (2) achieve up to 85% gradient sparsity --- in each case without accuracy degradation.\nReference implementation accompanies the paper in the supplementary material.", + "title": "Neural gradients are near-lognormal: improved quantized and sparse training", + "authors": [ + "Brian Chmiel", + "Liad Ben-Uri", + "Moran Shkolnik", + "Elad Hoffer", + "Ron Banner", + "Daniel Soudry" + ], + "emails": [ + "~Brian_Chmiel1", + "~Ron_Banner1", + "~Moran_Shkolnik1", + "gmail.com", + "~Daniel_Soudry1", + "~Elad_Hoffer1" + ], + "rank": 184 + }, + { + "url": "https://openreview.net/forum?id=HHSEKOnPvaO", + "decision": "Accept (Spotlight)", + "ratings": [ + 6, + 7, + 8, + 7 + ], + "rating": "7.00", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Despite significant advances, continual learning models still suffer from catastrophic forgetting when exposed to incrementally available data from non-stationary distributions. Rehearsal approaches alleviate the problem by maintaining and replaying a small episodic memory of previous samples, often implemented as an array of independent memory slots. In this work, we propose to augment such an array with a learnable random graph that captures pairwise similarities between its samples, and use it not only to learn new tasks but also to guard against forgetting. Empirical results on several benchmark datasets show that our model consistently outperforms recently proposed baselines for task-free continual learning.", + "title": "Graph-Based Continual Learning", + "authors": [ + "Binh Tang", + "David S. 
Matteson" + ], + "emails": [ + "~Binh_Tang1", + "~David_S._Matteson1" + ], + "rank": 185 + }, + { + "url": "https://openreview.net/forum?id=POWv6hDd9XH", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 8, + 6, + 7 + ], + "rating": "7.00", + "confidences": [ + 1, + 4, + 4, + 4 + ], + "abstract": "We study the challenging task of neural network quantization without end-to-end retraining, called Post-training Quantization (PTQ). PTQ usually requires a small subset of training data but produces less powerful quantized models than Quantization-Aware Training (QAT). In this work, we propose a novel PTQ framework, dubbed BRECQ, which pushes the limits of bitwidth in PTQ down to INT2 for the first time. BRECQ leverages the basic building blocks in neural networks and reconstructs them one-by-one. In a comprehensive theoretical study of the second-order error, we show that BRECQ achieves a good balance between cross-layer dependency and generalization error. To further employ the power of quantization, the mixed precision technique is incorporated in our framework by approximating the inter-layer and intra-layer sensitivity. Extensive experiments on various handcrafted and searched neural architectures are conducted for both image classification and object detection tasks. And for the first time we prove that, without bells and whistles, PTQ can attain 4-bit ResNet and MobileNetV2 comparable with QAT and enjoy 240 times faster production of quantized models. 
Codes are available at https://github.com/yhhhli/BRECQ.", + "title": "BRECQ: Pushing the Limit of Post-Training Quantization by Block Reconstruction", + "authors": [ + "Yuhang Li", + "Ruihao Gong", + "Xu Tan", + "Yang Yang", + "Peng Hu", + "Qi Zhang", + "Fengwei Yu", + "Wei Wang", + "Shi Gu" + ], + "emails": [ + "~Fengwei_Yu1", + "~Qi_Zhang15", + "~Xu_Tan3", + "~Wei_Wang3", + "~Ruihao_Gong1", + "~Shi_Gu1", + "~Yang_Yang22", + "~Yuhang_Li1", + "~Peng_Hu3" + ], + "rank": 186 + }, + { + "url": "https://openreview.net/forum?id=o966_Is_nPA", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7, + 8 + ], + "rating": "7.00", + "confidences": [ + 4, + 5, + 5, + 5 + ], + "abstract": "Regularization has long been utilized to learn sparsity in deep neural network pruning. However, its role is mainly explored in the small penalty strength regime. In this work, we extend its application to a new scenario where the regularization grows large gradually to tackle two central problems of pruning: pruning schedule and weight importance scoring. (1) The former topic is newly brought up in this work, which we find critical to the pruning performance while receives little research attention. Specifically, we propose an L2 regularization variant with rising penalty factors and show it can bring significant accuracy gains compared with its one-shot counterpart, even when the same weights are removed. (2) The growing penalty scheme also brings us an approach to exploit the Hessian information for more accurate pruning without knowing their specific values, thus not bothered by the common Hessian approximation problems. Empirically, the proposed algorithms are easy to implement and scalable to large datasets and networks in both structured and unstructured pruning. Their effectiveness is demonstrated with modern deep neural networks on the CIFAR and ImageNet datasets, achieving competitive results compared to many state-of-the-art algorithms. 
Our code and trained models are publicly available at https://github.com/mingsun-tse/regularization-pruning.", + "title": "Neural Pruning via Growing Regularization", + "authors": [ + "Huan Wang", + "Can Qin", + "Yulun Zhang", + "Yun Fu" + ], + "emails": [ + "~Yulun_Zhang1", + "~Can_Qin1", + "~Huan_Wang3", + "~Yun_Fu1" + ], + "rank": 187 + }, + { + "url": "https://openreview.net/forum?id=87ZwsaQNHPZ", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 5, + 5, + 3, + 3 + ], + "abstract": "Low-precision deep neural network (DNN) training has gained tremendous attention as reducing precision is one of the most effective knobs for boosting DNNs' training time/energy efficiency. In this paper, we attempt to explore low-precision training from a new perspective as inspired by recent findings in understanding DNN training: we conjecture that DNNs' precision might have a similar effect as the learning rate during DNN training, and advocate dynamic precision along the training trajectory for further boosting the time/energy efficiency of DNN training. Specifically, we propose Cyclic Precision Training (CPT) to cyclically vary the precision between two boundary values which can be identified using a simple precision range test within the first few training epochs. Extensive simulations and ablation studies on five datasets and eleven models demonstrate that CPT's effectiveness is consistent across various models/tasks (including classification and language modeling). 
Furthermore, through experiments and visualization we show that CPT helps to (1) converge to a wider minima with a lower generalization error and (2) reduce training variance which we believe opens up a new design knob for simultaneously improving the optimization and efficiency of DNN training.", + "title": "CPT: Efficient Deep Neural Network Training via Cyclic Precision", + "authors": [ + "Yonggan Fu", + "Han Guo", + "Meng Li", + "Xin Yang", + "Yining Ding", + "Vikas Chandra", + "Yingyan Lin" + ], + "emails": [ + "~Yingyan_Lin1", + "~Vikas_Chandra2", + "rice.edu", + "~Yonggan_Fu1", + "fb.com" + ], + "rank": 188 + }, + { + "url": "https://openreview.net/forum?id=fGF8qAqpXXG", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 2, + 3, + 5, + 4 + ], + "abstract": "We describe the convex semi-infinite dual of the two-layer vector-output ReLU neural network training problem. This semi-infinite dual admits a finite dimensional representation, but its support is over a convex set which is difficult to characterize. In particular, we demonstrate that the non-convex neural network training problem is equivalent to a finite-dimensional convex copositive program. Our work is the first to identify this strong connection between the global optima of neural networks and those of copositive programs. We thus demonstrate how neural networks implicitly attempt to solve copositive programs via semi-nonnegative matrix factorization, and draw key insights from this formulation. We describe the first algorithms for provably finding the global minimum of the vector output neural network training problem, which are polynomial in the number of samples for a fixed data rank, yet exponential in the dimension. However, in the case of convolutional architectures, the computational complexity is exponential in only the filter size and polynomial in all other parameters. 
We describe the circumstances in which we can find the global optimum of this neural network training problem exactly with soft-thresholded SVD, and provide a copositive relaxation which is guaranteed to be exact for certain classes of problems, and which corresponds with the solution of Stochastic Gradient Descent in practice.", + "title": "Vector-output ReLU Neural Network Problems are Copositive Programs: Convex Analysis of Two Layer Networks and Polynomial-time Algorithms", + "authors": [ + "Arda Sahiner", + "Tolga Ergen", + "John M. Pauly", + "Mert Pilanci" + ], + "emails": [ + "~Mert_Pilanci3", + "~John_M._Pauly1", + "~Arda_Sahiner1", + "~Tolga_Ergen1" + ], + "rank": 189 + }, + { + "url": "https://openreview.net/forum?id=6DOZ8XNNfGN", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 3, + 2, + 3, + 3 + ], + "abstract": "Graph Representation Learning (GRL) methods have impacted fields from chemistry to social science. However, their algorithmic implementations are specialized to specific use-cases e.g. \"message passing\" methods are run differently from \"node embedding\" ones. Despite their apparent differences, all these methods utilize the graph structure, and therefore, their learning can be approximated with stochastic graph traversals. We propose Graph Traversal via Tensor Functionals (GTTF), a unifying meta-algorithm framework for easing the implementation of diverse graph algorithms and enabling transparent and efficient scaling to large graphs. GTTF is founded upon a data structure (stored as a sparse tensor) and a stochastic graph traversal algorithm (described using tensor operations). The algorithm is a functional that accept two functions, and can be specialized to obtain a variety of GRL models and objectives, simply by changing those two functions. 
We show for a wide class of methods, our algorithm learns in an unbiased fashion and, in expectation, approximates the learning as if the specialized implementations were run directly.\nWith these capabilities, we scale otherwise non-scalable methods to set state-of-the-art on large graph datasets while being more efficient than existing GRL libraries -- with only a handful of lines of code for each method specialization.", + "title": "Graph Traversal with Tensor Functionals: A Meta-Algorithm for Scalable Learning", + "authors": [ + "Elan Sopher Markowitz", + "Keshav Balasubramanian", + "Mehrnoosh Mirtaheri", + "Sami Abu-El-Haija", + "Bryan Perozzi", + "Greg Ver Steeg", + "Aram Galstyan" + ], + "emails": [ + "~Elan_Sopher_Markowitz2", + "~Bryan_Perozzi1", + "~Aram_Galstyan1", + "~Sami_Abu-El-Haija1", + "usc.edu", + "~Greg_Ver_Steeg1" + ], + "rank": 190 + }, + { + "url": "https://openreview.net/forum?id=1Fqg133qRaI", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "Training Generative Adversarial Networks (GAN) on high-fidelity images usually requires large-scale GPU-clusters and a vast number of training images. In this paper, we study the few-shot image synthesis task for GAN with minimum computing cost. We propose a light-weight GAN structure that gains superior quality on 1024^2 resolution. Notably, the model converges from scratch with just a few hours of training on a single RTX-2080 GPU, and has a consistent performance, even with less than 100 training samples. Two technique designs constitute our work, a skip-layer channel-wise excitation module and a self-supervised discriminator trained as a feature-encoder. 
With thirteen datasets covering a wide variety of image domains (The datasets and code are available at https://github.com/odegeasslbc/FastGAN-pytorch), we show our model's superior performance compared to the state-of-the-art StyleGAN2, when data and computing budget are limited.", + "title": "Towards Faster and Stabilized GAN Training for High-fidelity Few-shot Image Synthesis", + "authors": [ + "Bingchen Liu", + "Yizhe Zhu", + "Kunpeng Song", + "Ahmed Elgammal" + ], + "emails": [ + "~Yizhe_Zhu2", + "~Ahmed_Elgammal1", + "~Bingchen_Liu2", + "~Kunpeng_Song1" + ], + "rank": 191 + }, + { + "url": "https://openreview.net/forum?id=gvxJzw8kW4b", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "While deep neural networks show great performance on fitting to the training distribution, improving the networks' generalization performance to the test distribution and robustness to the sensitivity to input perturbations still remain as a challenge. Although a number of mixup based augmentation strategies have been proposed to partially address them, it remains unclear as to how to best utilize the supervisory signal within each input data for mixup from the optimization perspective. We propose a new perspective on batch mixup and formulate the optimal construction of a batch of mixup data maximizing the data saliency measure of each individual mixup data and encouraging the supermodular diversity among the constructed mixup data. This leads to a novel discrete optimization problem minimizing the difference between submodular functions. We also propose an efficient modular approximation based iterative submodular minimization algorithm for efficient mixup computation per each minibatch suitable for minibatch based neural network training. 
Our experiments show the proposed method achieves the state of the art generalization, calibration, and weakly supervised localization results compared to other mixup methods. The source code is available at https://github.com/snu-mllab/Co-Mixup.", + "title": "Co-Mixup: Saliency Guided Joint Mixup with Supermodular Diversity", + "authors": [ + "JangHyun Kim", + "Wonho Choo", + "Hosan Jeong", + "Hyun Oh Song" + ], + "emails": [ + "~JangHyun_Kim1", + "~Wonho_Choo1", + "~Hyun_Oh_Song1", + "~Hosan_Jeong1" + ], + "rank": 192 + }, + { + "url": "https://openreview.net/forum?id=X76iqnUbBjz", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 10, + 5, + 6 + ], + "rating": "7.00", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "In this paper, we use the interaction inside adversarial perturbations to explain and boost the adversarial transferability. We discover and prove the negative correlation between the adversarial transferability and the interaction inside adversarial perturbations. The negative correlation is further verified through different DNNs with various inputs. Moreover, this negative correlation can be regarded as a unified perspective to understand current transferability-boosting methods. To this end, we prove that some classic methods of enhancing the transferability essentially decease interactions inside adversarial perturbations. Based on this, we propose to directly penalize interactions during the attacking process, which significantly improves the adversarial transferability. 
We will release the code when the paper is accepted.", + "title": "A Unified Approach to Interpreting and Boosting Adversarial Transferability", + "authors": [ + "Xin Wang", + "Jie Ren", + "Shuyun Lin", + "Xiangming Zhu", + "Yisen Wang", + "Quanshi Zhang" + ], + "emails": [ + "~Quanshi_Zhang1", + "~Xin_Wang25", + "~Shuyun_Lin1", + "~Jie_Ren1", + "~Xiangming_Zhu2", + "~Yisen_Wang1" + ], + "rank": 193 + }, + { + "url": "https://openreview.net/forum?id=zrT3HcsWSAt", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 7, + 6 + ], + "rating": "7.00", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "We consider the problem of learning an optimal expert behavior policy given noisy demonstrations that contain observations from both optimal and non-optimal expert behaviors. Popular imitation learning algorithms, such as generative adversarial imitation learning, assume that (clear) demonstrations are given from optimal expert policies but not the non-optimal ones, and thus often fail to imitate the optimal expert behaviors given the noisy demonstrations. Prior works that address the problem require (1) learning policies through environment interactions in the same fashion as reinforcement learning, and (2) annotating each demonstration with confidence scores or rankings. However, such environment interactions and annotations in real-world settings take impractically long training time and a significant human effort. In this paper, we propose an imitation learning algorithm to address the problem without any environment interactions and annotations associated with the non-optimal demonstrations. The proposed algorithm learns ensemble policies with a generalized behavioral cloning (BC) objective function where we exploit another policy already learned by BC. 
Experimental results show that the proposed algorithm can learn behavior policies that are much closer to the optimal policies than ones learned by BC.", + "title": "Behavioral Cloning from Noisy Demonstrations", + "authors": [ + "Fumihiro Sasaki", + "Ryota Yamashina" + ], + "emails": [ + "jp.ricoh.com", + "~Fumihiro_Sasaki2" + ], + "rank": 194 + }, + { + "url": "https://openreview.net/forum?id=JoCR4h9O3Ew", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 3, + 5, + 5, + 4 + ], + "abstract": "Adversarial attacks pose a major challenge for modern deep neural networks. Recent advancements show that adversarially robust generalization requires a large amount of labeled data for training. If annotation becomes a burden, can unlabeled data help bridge the gap? In this paper, we propose ARMOURED, an adversarially robust training method based on semi-supervised learning that consists of two components. The first component applies multi-view learning to simultaneously optimize multiple independent networks and utilizes unlabeled data to enforce labeling consistency. The second component reduces adversarial transferability among the networks via diversity regularizers inspired by determinantal point processes and entropy maximization. Experimental results show that under small perturbation budgets, ARMOURED is robust against strong adaptive adversaries. Notably, ARMOURED does not rely on generating adversarial samples during training. 
When used in combination with adversarial training, ARMOURED yields competitive performance with the state-of-the-art adversarially-robust benchmarks on SVHN and outperforms them on CIFAR-10, while offering higher clean accuracy.", + "title": "ARMOURED: Adversarially Robust MOdels using Unlabeled data by REgularizing Diversity", + "authors": [ + "Kangkang Lu", + "Cuong Manh Nguyen", + "Xun Xu", + "Kiran Chari", + "Yu Jing Goh", + "Chuan-Sheng Foo" + ], + "emails": [ + "~Yu_Jing_Goh1", + "~Kangkang_Lu1", + "~Xun_Xu1", + "~Kiran_Chari1", + "~Chuan-Sheng_Foo1", + "~Cuong_Manh_Nguyen1" + ], + "rank": 195 + }, + { + "url": "https://openreview.net/forum?id=NeRdBeTionN", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "The embeddings from CNNs pretrained on Imagenet classification are de-facto standard image representations for assessing GANs via FID, Precision and Recall measures. Despite broad previous criticism of their usage for non-Imagenet domains, these embeddings are still the top choice in most of the GAN literature.\n\nIn this paper, we advocate the usage of the state-of-the-art self-supervised representations to evaluate GANs on the established non-Imagenet benchmarks. These representations, typically obtained via contrastive learning, are shown to provide better transfer to new tasks and domains, therefore, can serve as more universal embeddings of natural images. 
With extensive comparison of the recent GANs on the common datasets, we show that self-supervised representations produce a more reasonable ranking of models in terms of FID/Precision/Recall, while the ranking with classification-pretrained embeddings often can be misleading.", + "title": "On Self-Supervised Image Representations for GAN Evaluation", + "authors": [ + "Stanislav Morozov", + "Andrey Voynov", + "Artem Babenko" + ], + "emails": [ + "~Andrey_Voynov1", + "~Artem_Babenko1", + "~Stanislav_Morozov1" + ], + "rank": 196 + }, + { + "url": "https://openreview.net/forum?id=OPyWRrcjVQw", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 8, + 6 + ], + "rating": "7.00", + "confidences": [ + 4, + 5, + 3, + 3 + ], + "abstract": "Explainability in AI is crucial for model development, compliance with regulation, and providing operational nuance to predictions. The Shapley framework for explainability attributes a model\u2019s predictions to its input features in a mathematically principled and model-agnostic way. However, general implementations of Shapley explainability make an untenable assumption: that the model\u2019s features are uncorrelated. In this work, we demonstrate unambiguous drawbacks of this assumption and develop two solutions to Shapley explainability that respect the data manifold. One solution, based on generative modelling, provides flexible access to data imputations; the other directly learns the Shapley value-function, providing performance and stability at the cost of flexibility. 
While \u201coff-manifold\u201d Shapley values can (i) give rise to incorrect explanations, (ii) hide implicit model dependence on sensitive attributes, and (iii) lead to unintelligible explanations in higher-dimensional data, on-manifold explainability overcomes these problems.\n", + "title": "Shapley explainability on the data manifold", + "authors": [ + "Christopher Frye", + "Damien de Mijolla", + "Tom Begley", + "Laurence Cowton", + "Megan Stanley", + "Ilya Feige" + ], + "emails": [ + "~Christopher_Frye1", + "microsoft.com", + "faculty.ai", + "gmail.com", + "~Ilya_Feige1", + "~Tom_Begley1" + ], + "rank": 197 + }, + { + "url": "https://openreview.net/forum?id=8yKEo06dKNo", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 7, + 7, + 6 + ], + "rating": "7.00", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Mixup is a popular data augmentation technique based on on convex combinations of pairs of examples and their labels. This simple technique has shown to substantially improve both the model's robustness as well as the generalization of the trained model. However, it is not well-understood why such improvement occurs. In this paper, we provide theoretical analysis to demonstrate how using Mixup in training helps model robustness and generalization. For robustness, we show that minimizing the Mixup loss corresponds to approximately minimizing an upper bound of the adversarial loss. This explains why models obtained by Mixup training exhibits robustness to several kinds of adversarial attacks such as Fast Gradient Sign Method (FGSM). For generalization, we prove that Mixup augmentation corresponds to a specific type of data-adaptive regularization which reduces overfitting. 
Our analysis provides new insights and a framework to understand Mixup.\n", + "title": "How Does Mixup Help With Robustness and Generalization?", + "authors": [ + "Linjun Zhang", + "Zhun Deng", + "Kenji Kawaguchi", + "Amirata Ghorbani", + "James Zou" + ], + "emails": [ + "~Zhun_Deng1", + "~Amirata_Ghorbani2", + "~James_Zou1", + "~Kenji_Kawaguchi1", + "rutgers.edu" + ], + "rank": 198 + }, + { + "url": "https://openreview.net/forum?id=9EsrXMzlFQY", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 6, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 2, + 2, + 5, + 3 + ], + "abstract": "Regularization by denoising (RED) is a recently developed framework for solving inverse problems by integrating advanced denoisers as image priors. Recent work has shown its state-of-the-art performance when combined with pre-trained deep denoisers. However, current RED algorithms are inadequate for parallel processing on multicore systems. We address this issue by proposing a new{asynchronous RED (Async-RED) algorithm that enables asynchronous parallel processing of data, making it significantly faster than its serial counterparts for large-scale inverse problems. The computational complexity of Async-RED is further reduced by using a random subset of measurements at every iteration. We present a complete theoretical analysis of the algorithm by establishing its convergence under explicit assumptions on the data-fidelity and the denoiser. 
We validate Async-RED on image recovery using pre-trained deep denoisers as priors.", + "title": "Async-RED: A Provably Convergent Asynchronous Block Parallel Stochastic Method using Deep Denoising Priors", + "authors": [ + "Yu Sun", + "Jiaming Liu", + "Yiran Sun", + "Brendt Wohlberg", + "Ulugbek Kamilov" + ], + "emails": [ + "~Brendt_Wohlberg2", + "wustl.edu", + "~Yu_Sun11", + "~Ulugbek_Kamilov1" + ], + "rank": 199 + }, + { + "url": "https://openreview.net/forum?id=jznizqvr15J", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 3, + 3, + 3 + ], + "abstract": "Consider a prediction setting with few in-distribution labeled examples and many unlabeled examples both in- and out-of-distribution (OOD). The goal is to learn a model which performs well both in-distribution and OOD. In these settings, auxiliary information is often cheaply available for every input. How should we best leverage this auxiliary information for the prediction task? Empirically across three image and time-series datasets, and theoretically in a multi-task linear regression setting, we show that (i) using auxiliary information as input features improves in-distribution error but can hurt OOD error; but (ii) using auxiliary information as outputs of auxiliary pre-training tasks improves OOD error. To get the best of both worlds, we introduce In-N-Out, which first trains a model with auxiliary inputs and uses it to pseudolabel all the in-distribution inputs, then pre-trains a model on OOD auxiliary outputs and fine-tunes this model with the pseudolabels (self-training). 
We show both theoretically and empirically that In-N-Out outperforms auxiliary inputs or outputs alone on both in-distribution and OOD error.", + "title": "In-N-Out: Pre-Training and Self-Training using Auxiliary Information for Out-of-Distribution Robustness", + "authors": [ + "Sang Michael Xie", + "Ananya Kumar", + "Robbie Jones", + "Fereshte Khani", + "Tengyu Ma", + "Percy Liang" + ], + "emails": [ + "~Percy_Liang1", + "stanford.edu", + "~Ananya_Kumar1", + "~Fereshte_Khani1", + "~Sang_Michael_Xie1", + "~Tengyu_Ma1" + ], + "rank": 200 + }, + { + "url": "https://openreview.net/forum?id=rABUmU3ulQh", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 8, + 7 + ], + "rating": "7.00", + "confidences": [ + 5, + 5, + 3 + ], + "abstract": "In this work, we present a probabilistic 3D generative model, named Generative Cellular Automata, which is able to produce diverse and high quality shapes. We formulate the shape generation process as sampling from the transition kernel of a Markov chain, where the sampling chain eventually evolves to the full shape of the learned distribution. The transition kernel employs the local update rules of cellular automata, effectively reducing the search space in a high-resolution 3D grid space by exploiting the connectivity and sparsity of 3D shapes. Our progressive generation only focuses on the sparse set of occupied voxels and their neighborhood, thus enables the utilization of an expressive sparse convolutional network. We propose an effective training scheme to obtain the local homogeneous rule of generative cellular automata with sequences that are slightly different from the sampling chain but converge to the full shapes in the training data. 
Extensive experiments on probabilistic shape completion and shape generation demonstrate that our method achieves competitive performance against recent methods.", + "title": "Learning to Generate 3D Shapes with Generative Cellular Automata", + "authors": [ + "Dongsu Zhang", + "Changwoon Choi", + "Jeonghwan Kim", + "Young Min Kim" + ], + "emails": [ + "~Young_Min_Kim1", + "~Changwoon_Choi1", + "snu.ac.kr", + "~Dongsu_Zhang1" + ], + "rank": 201 + }, + { + "url": "https://openreview.net/forum?id=xfmSoxdxFCG", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7 + ], + "rating": "7.00", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "The mushroom body of the fruit fly brain is one of the best studied systems in neuroscience. At its core it consists of a population of Kenyon cells, which receive inputs from multiple sensory modalities. These cells are inhibited by the anterior paired lateral neuron, thus creating a sparse high dimensional representation of the inputs. In this work we study a mathematical formalization of this network motif and apply it to learning the correlational structure between words and their context in a corpus of unstructured text, a common natural language processing (NLP) task. We show that this network can learn semantic representations of words and can generate both static and context-dependent word embeddings. Unlike conventional methods (e.g., BERT, GloVe) that use dense representations for word embedding, our algorithm encodes semantic meaning of words and their context in the form of sparse binary hash codes. The quality of the learned representations is evaluated on word similarity analysis, word-sense disambiguation, and document classification. It is shown that not only can the fruit fly network motif achieve performance comparable to existing methods in NLP, but, additionally, it uses only a fraction of the computational resources (shorter training time and smaller memory footprint). 
", + "title": "Can a Fruit Fly Learn Word Embeddings?", + "authors": [ + "Yuchen Liang", + "Chaitanya Ryali", + "Benjamin Hoover", + "Leopold Grinberg", + "Saket Navlakha", + "Mohammed J Zaki", + "Dmitry Krotov" + ], + "emails": [ + "rpi.edu", + "~Mohammed_J_Zaki1", + "~Dmitry_Krotov2", + "gmail.com", + "ibm.com", + "cshl.edu", + "~Chaitanya_Ryali1" + ], + "rank": 202 + }, + { + "url": "https://openreview.net/forum?id=Rd138pWXMvG", + "decision": "Accept (Poster)", + "ratings": [ + 9, + 7, + 6, + 6 + ], + "rating": "7.00", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "To get Bayesian neural networks to perform comparably to standard neural networks it is usually necessary to artificially reduce uncertainty using a tempered or cold posterior. This is extremely concerning: if the prior is accurate, Bayes inference/decision theory is optimal, and any artificial changes to the posterior should harm performance. While this suggests that the prior may be at fault, here we argue that in fact, BNNs for image classification use the wrong likelihood. In particular, standard image benchmark datasets such as CIFAR-10 are carefully curated. We develop a generative model describing curation which gives a principled Bayesian account of cold posteriors, because the likelihood under this new generative model closely matches the tempered likelihoods used in past work.", + "title": "A statistical theory of cold posteriors in deep neural networks", + "authors": [ + "Laurence Aitchison" + ], + "emails": [ + "~Laurence_Aitchison1" + ], + "rank": 203 + }, + { + "url": "https://openreview.net/forum?id=zDy_nQCXiIj", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 6, + 6, + 8 + ], + "rating": "6.94", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Recent research has shown remarkable success in revealing \"steering\" directions in the latent spaces of pre-trained GANs. 
These directions correspond to semantically meaningful image transformations (e.g., shift, zoom, color manipulations), and have the same interpretable effect across all categories that the GAN can generate. Some methods focus on user-specified transformations, while others discover transformations in an unsupervised manner. However, all existing techniques rely on an optimization procedure to expose those directions, and offer no control over the degree of allowed interaction between different transformations. In this paper, we show that \"steering\" trajectories can be computed in closed form directly from the generator's weights without any form of training or optimization. This applies to user-prescribed geometric transformations, as well as to unsupervised discovery of more complex effects. Our approach allows determining both linear and nonlinear trajectories, and has many advantages over previous methods. In particular, we can control whether one transformation is allowed to come on the expense of another (e.g., zoom-in with or without allowing translation to keep the object centered). Moreover, we can determine the natural end-point of the trajectory, which corresponds to the largest extent to which a transformation can be applied without incurring degradation. 
Finally, we show how transferring attributes between images can be achieved without optimization, even across different categories.\n", + "title": "GAN \"Steerability\" without optimization ", + "authors": [ + "Nurit Spingarn", + "Ron Banner", + "Tomer Michaeli" + ], + "emails": [ + "~Ron_Banner1", + "~Nurit_Spingarn1", + "~Tomer_Michaeli1" + ], + "rank": 204 + }, + { + "url": "https://openreview.net/forum?id=vVjIW3sEc1s", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 8, + 7 + ], + "rating": "6.94", + "confidences": [ + 3, + 3, + 2, + 4, + 4 + ], + "abstract": "Autoregressive language models, pretrained using large text corpora to do well on next word prediction, have been successful at solving many downstream tasks, even with zero-shot usage. However, there is little theoretical understanding of this success. This paper initiates a mathematical study of this phenomenon for the downstream task of text classification by considering the following questions: (1) What is the intuitive connection between the pretraining task of next word prediction and text classification? (2) How can we mathematically formalize this connection and quantify the benefit of language modeling? For (1), we hypothesize, and verify empirically, that classification tasks of interest can be reformulated as sentence completion tasks, thus making language modeling a meaningful pretraining task. With a mathematical formalization of this hypothesis, we make progress towards (2) and show that language models that are \u03f5-optimal in cross-entropy (log-perplexity) learn features that can linearly solve such classification tasks with O(\u03f5) error, thus demonstrating that doing well on language modeling can be beneficial for downstream tasks. 
We experimentally verify various assumptions and theoretical findings, and also use insights from the analysis to design a new objective function that performs well on some classification tasks.", + "title": "A Mathematical Exploration of Why Language Models Help Solve Downstream Tasks", + "authors": [ + "Nikunj Saunshi", + "Sadhika Malladi", + "Sanjeev Arora" + ], + "emails": [ + "~Sanjeev_Arora1", + "~Nikunj_Saunshi1", + "~Sadhika_Malladi2" + ], + "rank": 205 + }, + { + "url": "https://openreview.net/forum?id=q_S44KLQ_Aa", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 8, + 8 + ], + "rating": "6.94", + "confidences": [ + 4, + 5, + 3, + 4 + ], + "abstract": " It is well-established that many iterative sparse reconstruction algorithms can be unrolled to yield a learnable neural network for improved empirical performance. A prime example is learned ISTA (LISTA) where weights, step sizes and thresholds are learned from training data. Recently, Analytic LISTA (ALISTA) has been introduced, combining the strong empirical performance of a fully learned approach like LISTA, while retaining theoretical guarantees of classical compressed sensing algorithms and significantly reducing the number of parameters to learn. However, these parameters are trained to work in expectation, often leading to suboptimal reconstruction of individual targets. In this work we therefore introduce Neurally Augmented ALISTA, in which an LSTM network is used to compute step sizes and thresholds individually for each target vector during reconstruction. This adaptive approach is theoretically motivated by revisiting the recovery guarantees of ALISTA. 
We show that our approach further improves empirical performance in sparse reconstruction, in particular outperforming existing algorithms by an increasing margin as the compression ratio becomes more challenging.", + "title": "Neurally Augmented ALISTA", + "authors": [ + "Freya Behrens", + "Jonathan Sauder", + "Peter Jung" + ], + "emails": [ + "~Freya_Behrens1", + "~Peter_Jung2", + "~Jonathan_Sauder2" + ], + "rank": 206 + }, + { + "url": "https://openreview.net/forum?id=eU776ZYxEpz", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 9 + ], + "rating": "6.94", + "confidences": [ + 3, + 4, + 4, + 5 + ], + "abstract": " The units in artificial neural networks (ANNs) can be thought of as abstractions of biological neurons, and ANNs are increasingly used in neuroscience research. However, there are many important differences between ANN units and real neurons. One of the most notable is the absence of Dale's principle, which ensures that biological neurons are either exclusively excitatory or inhibitory. Dale's principle is typically left out of ANNs because its inclusion impairs learning. This is problematic, because one of the great advantages of ANNs for neuroscience research is their ability to learn complicated, realistic tasks. Here, by taking inspiration from feedforward inhibitory interneurons in the brain we show that we can develop ANNs with separate populations of excitatory and inhibitory units that learn just as well as standard ANNs. We call these networks Dale's ANNs (DANNs). We present two insights that enable DANNs to learn well: (1) DANNs are related to normalization schemes, and can be initialized such that the inhibition centres and standardizes the excitatory activity, (2) updates to inhibitory neuron parameters should be scaled using corrections based on the Fisher Information matrix. 
These results demonstrate how ANNs that respect Dale's principle can be built without sacrificing learning performance, which is important for future work using ANNs as models of the brain. The results may also have interesting implications for how inhibitory plasticity in the real brain operates.", + "title": "Learning to live with Dale's principle: ANNs with separate excitatory and inhibitory units", + "authors": [ + "Jonathan Cornford", + "Damjan Kalajdzievski", + "Marco Leite", + "Am\u00e9lie Lamarquette", + "Dimitri Michael Kullmann", + "Blake Aaron Richards" + ], + "emails": [ + "~Jonathan_Cornford1", + "ucl.ac.uk", + "cam.ac.uk", + "gmail.com", + "~Blake_Aaron_Richards1" + ], + "rank": 207 + }, + { + "url": "https://openreview.net/forum?id=lQdXeXDoWtI", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 7, + 6, + 7 + ], + "rating": "6.94", + "confidences": [ + 4, + 5, + 5, + 4 + ], + "abstract": "The goal of domain generalization algorithms is to predict well on distributions different from those seen during training.\nWhile a myriad of domain generalization algorithms exist, inconsistencies in experimental conditions---datasets, network architectures, and model selection criteria---render fair comparisons difficult.\nThe goal of this paper is to understand how useful domain generalization algorithms are in realistic settings.\nAs a first step, we realize that model selection is non-trivial for domain generalization tasks, and we argue that algorithms without a model selection criterion remain incomplete.\nNext we implement DomainBed, a testbed for domain generalization including seven benchmarks, fourteen algorithms, and three model selection criteria.\nWhen conducting extensive experiments using DomainBed we find that when carefully implemented and tuned, ERM outperforms the state-of-the-art in terms of average performance.\nFurthermore, no algorithm included in DomainBed outperforms ERM by more than one point when evaluated under the same experimental 
conditions.\nWe hope that the release of DomainBed, alongside contributions from fellow researchers, will streamline reproducible and rigorous advances in domain generalization.", + "title": "In Search of Lost Domain Generalization", + "authors": [ + "Ishaan Gulrajani", + "David Lopez-Paz" + ], + "emails": [ + "~David_Lopez-Paz2", + "~Ishaan_Gulrajani1" + ], + "rank": 208 + }, + { + "url": "https://openreview.net/forum?id=KJNcAkY8tY4", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 8, + 6, + 7 + ], + "rating": "6.93", + "confidences": [ + 3, + 5, + 3, + 3 + ], + "abstract": "A key factor in the success of deep neural networks is the ability to scale models to improve performance by varying the architecture depth and width. This simple property of neural network design has resulted in highly effective architectures for a variety of tasks. Nevertheless, there is limited understanding of effects of depth and width on the learned representations. In this paper, we study this fundamental question. We begin by investigating how varying depth and width affects model hidden representations, finding a characteristic block structure in the hidden representations of larger capacity (wider or deeper) models. We demonstrate that this block structure arises when model capacity is large relative to the size of the training set, and is indicative of the underlying layers preserving and propagating the dominant principal component of their representations. This discovery has important ramifications for features learned by different models, namely, representations outside the block structure are often similar across architectures with varying widths and depths, but the block structure is unique to each model. 
We analyze the output predictions of different model architectures, finding that even when the overall accuracy is similar, wide and deep models exhibit distinctive error patterns and variations across classes.", + "title": "Do Wide and Deep Networks Learn the Same Things? Uncovering How Neural Network Representations Vary with Width and Depth", + "authors": [ + "Thao Nguyen", + "Maithra Raghu", + "Simon Kornblith" + ], + "emails": [ + "~Thao_Nguyen3", + "~Maithra_Raghu1", + "~Simon_Kornblith1" + ], + "rank": 209 + }, + { + "url": "https://openreview.net/forum?id=8wqCDnBmnrT", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 7, + 7, + 6 + ], + "rating": "6.93", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Visual cognition of primates is superior to that of artificial neural networks in its ability to \u201cenvision\u201d a visual object, even a newly-introduced one, in different attributes including pose, position, color, texture, etc. To aid neural networks to envision objects with different attributes, we propose a family of objective functions, expressed on groups of examples, as a novel learning framework that we term Group-Supervised Learning (GSL). GSL allows us to decompose inputs into a disentangled representation with swappable components, that can be recombined to synthesize new samples. For instance, images of red boats & blue cars can be decomposed and recombined to synthesize novel images of red cars. We propose an implementation based on auto-encoder, termed group-supervised zero-shot synthesis network (GZS-Net) trained with our learning framework, that can produce a high-quality red car even if no such example is witnessed during training. We test our model and learning framework on existing benchmarks, in addition to a new dataset that we open-source. 
We qualitatively and quantitatively demonstrate that GZS-Net trained with GSL outperforms state-of-the-art methods", + "title": "Zero-shot Synthesis with Group-Supervised Learning", + "authors": [ + "Yunhao Ge", + "Sami Abu-El-Haija", + "Gan Xin", + "Laurent Itti" + ], + "emails": [ + "~Yunhao_Ge1", + "~Gan_Xin1", + "~Laurent_Itti1", + "~Sami_Abu-El-Haija1" + ], + "rank": 210 + }, + { + "url": "https://openreview.net/forum?id=4D4Rjrwaw3q", + "decision": "Reject", + "ratings": [ + 6, + 7, + 5, + 9 + ], + "rating": "6.93", + "confidences": [ + 3, + 3, + 4, + 5 + ], + "abstract": "Existing studies in black-box optimization for machine learning suffer from low\ngeneralizability, caused by a typically selective choice of problem instances used\nfor training and testing different optimization algorithms. Among other issues,\nthis practice promotes overfitting and poor-performing user guidelines. To address\nthis shortcoming, we propose in this work a benchmark suite, OptimSuite,\nwhich covers a broad range of black-box optimization problems, ranging from\nacademic benchmarks to real-world applications, from discrete over numerical\nto mixed-integer problems, from small to very large-scale problems, from noisy\nover dynamic to static problems, etc. We demonstrate the advantages of such a\nbroad collection by deriving from it Automated Black Box Optimizer (ABBO), a\ngeneral-purpose algorithm selection wizard. Using three different types of algorithm\nselection techniques, ABBO achieves competitive performance on all\nbenchmark suites. It significantly outperforms previous state of the art on some of\nthem, including YABBOB and LSGO. ABBO relies on many high-quality base\ncomponents. Its excellent performance is obtained without any task-specific\nparametrization. 
The benchmark collection, the ABBO wizard, its base solvers,\nas well as all experimental data are reproducible and open source in OptimSuite.", + "title": "Black-Box Optimization Revisited: Improving Algorithm Selection Wizards through Massive Benchmarking", + "authors": [ + "Laurent Meunier", + "Herilalaina Rakotoarison", + "Jeremy Rapin", + "Paco Wong", + "Baptiste Roziere", + "Olivier Teytaud", + "Antoine Moreau", + "Carola Doerr" + ], + "emails": [ + "~Olivier_Teytaud2", + "~Herilalaina_Rakotoarison1", + "uca.fr", + "~Carola_Doerr1", + "gmail.com", + "~Laurent_Meunier1", + "fb.com" + ], + "rank": 211 + }, + { + "url": "https://openreview.net/forum?id=dKg5D1Z1Lm", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 7, + 6, + 7 + ], + "rating": "6.93", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Off-policy evaluation (OPE) is the task of estimating the expected reward of a given policy based on offline data previously collected under different policies. Therefore, OPE is a key step in applying reinforcement learning to real-world domains such as medical treatment, where interactive data collection is expensive or even unsafe. As the observed data tends to be noisy and limited, it is essential to provide rigorous uncertainty quantification, not just a point estimation, when applying OPE to make high stakes decisions. This work considers the problem of constructing non-asymptotic confidence intervals in infinite-horizon off-policy evaluation, which remains a challenging open question. We develop a practical algorithm through a primal-dual optimization-based approach, which leverages the kernel Bellman loss (KBL) of Feng et al. 2019 and a new martingale concentration inequality of KBL applicable to time-dependent data with unknown mixing conditions. 
Our algorithm makes minimum assumptions on the data and the function class of the Q-function, and works for the behavior-agnostic settings where the data is collected under a mix of arbitrary unknown behavior policies. We present empirical results that clearly demonstrate the advantages of our approach over existing methods.\n", + "title": "Non-asymptotic Confidence Intervals of Off-policy Evaluation: Primal and Dual Bounds ", + "authors": [ + "Yihao Feng", + "Ziyang Tang", + "na zhang", + "qiang liu" + ], + "emails": [ + "~Ziyang_Tang1", + "~qiang_liu4", + "pbcsf.tsinghua.edu.cn", + "~Yihao_Feng1" + ], + "rank": 212 + }, + { + "url": "https://openreview.net/forum?id=PUkhWz65dy5", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 7, + 7, + 6 + ], + "rating": "6.93", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "We study the problem of how to construct a set of policies that can be composed together to solve a collection of reinforcement learning tasks. Each task is a different reward function defined as a linear combination of known features. We consider a specific class of policy compositions which we call set improving policies (SIPs): given a set of policies and a set of tasks, a SIP is any composition of the former whose performance is at least as good as that of its constituents across all the tasks. We focus on the most conservative instantiation of SIPs, set-max policies (SMPs), so our analysis extends to any SIP. This includes known policy-composition operators like generalized policy improvement. Our main contribution is an algorithm that builds a set of policies in order to maximize the worst-case performance of the resulting SMP on the set of tasks. The algorithm works by successively adding new policies to the set. We show that the worst-case performance of the resulting SMP strictly improves at each iteration, and the algorithm only stops when there does not exist a policy that leads to improved performance. 
We empirically evaluate our algorithm on a grid world and also on a set of domains from the DeepMind control suite. We confirm our theoretical results regarding the monotonically improving performance of our algorithm. Interestingly, we also show empirically that the sets of policies computed by the algorithm are diverse, leading to different trajectories in the grid world and very distinct locomotion skills in the control suite.", + "title": "Discovering a set of policies for the worst case reward", + "authors": [ + "Tom Zahavy", + "Andre Barreto", + "Daniel J Mankowitz", + "Shaobo Hou", + "Brendan O'Donoghue", + "Iurii Kemaev", + "Satinder Singh" + ], + "emails": [ + "~Satinder_Singh2", + "~Brendan_O'Donoghue1", + "~Daniel_J_Mankowitz2", + "~Tom_Zahavy2", + "google.com", + "~Shaobo_Hou1", + "~Andre_Barreto1" + ], + "rank": 213 + }, + { + "url": "https://openreview.net/forum?id=z5Z023VBmDZ", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 8, + 5, + 7 + ], + "rating": "6.93", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Convolutional neural networks are utilized to solve increasingly more complex problems and with more data. As a result, researchers and practitioners seek to scale the representational power of such models by adding more parameters. However, increasing parameters requires additional critical resources in terms of memory and compute, leading to increased training and inference cost. Thus a consistent challenge is to obtain as high as possible accuracy within a parameter budget. As neural network designers navigate this complex landscape, they are guided by conventional wisdom that is informed from past empirical studies. We identify a critical part of this design space that is not well-understood: How to decide between the alternatives of expanding a single convolutional network model or increasing the number of networks in the form of an ensemble. 
We study this question in detail across various network architectures and data sets. We build an extensive experimental framework that captures numerous angles of the possible design space in terms of how a new set of parameters can be used in a model. We consider a holistic set of metrics such as training time, inference time, and memory usage. The framework provides a robust assessment by making sure it controls for the number of parameters. Contrary to conventional wisdom, we show that when we perform a holistic and robust assessment, we uncover a wide design space, where ensembles provide better accuracy, train faster, and deploy at speed comparable to single convolutional networks with the same total number of parameters. ", + "title": "More or Less: When and How to Build Convolutional Neural Network Ensembles", + "authors": [ + "Abdul Wasay", + "Stratos Idreos" + ], + "emails": [ + "~Abdul_Wasay1", + "~Stratos_Idreos1" + ], + "rank": 214 + }, + { + "url": "https://openreview.net/forum?id=krz7T0xU9Z_", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 5, + 8, + 7 + ], + "rating": "6.93", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "We study the inductive bias of two-layer ReLU networks trained by gradient flow. We identify a class of easy-to-learn (`orthogonally separable') datasets, and characterise the solution that ReLU networks trained on such datasets converge to. Irrespective of network width, the solution turns out to be a combination of two max-margin classifiers: one corresponding to the positive data subset and one corresponding to the negative data subset.\nThe proof is based on the recently introduced concept of extremal sectors, for which we prove a number of properties in the context of orthogonal separability. 
In particular, we prove stationarity of activation patterns from some time T onwards, which enables a reduction of the ReLU network to an ensemble of linear subnetworks.\n", + "title": "The inductive bias of ReLU networks on orthogonally separable data", + "authors": [ + "Mary Phuong", + "Christoph H Lampert" + ], + "emails": [ + "~Christoph_H_Lampert1", + "~Mary_Phuong1" + ], + "rank": 215 + }, + { + "url": "https://openreview.net/forum?id=vYVI1CHPaQg", + "decision": "Accept (Poster)", + "ratings": [ + 9, + 7, + 6, + 5 + ], + "rating": "6.93", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Modern large-scale machine learning applications require stochastic optimization algorithms to be implemented on distributed computing systems. A key bottleneck of such systems is the communication overhead for exchanging information across the workers, such as stochastic gradients. Among the many techniques proposed to remedy this issue, one of the most successful is the framework of compressed communication with error feedback (EF). EF remains the only known technique that can deal with the error induced by contractive compressors which are not unbiased, such as Top-K or PowerSGD. In this paper, we propose a new and theoretically and practically better alternative to EF for dealing with contractive compressors. In particular, we propose a construction which can transform any contractive compressor into an induced unbiased compressor. Following this transformation, existing methods able to work with unbiased compressors can be applied. We show that our approach leads to vast improvements over EF, including reduced memory requirements, better communication complexity guarantees and fewer assumptions. We further extend our results to federated learning with partial participation following an arbitrary distribution over the nodes and demonstrate the benefits thereof. 
We perform several numerical experiments which validate our theoretical findings.", + "title": "A Better Alternative to Error Feedback for Communication-Efficient Distributed Learning", + "authors": [ + "Samuel Horv\u00e1th", + "Peter Richtarik" + ], + "emails": [ + "~Samuel_Horv\u00e1th1", + "~Peter_Richtarik1" + ], + "rank": 216 + }, + { + "url": "https://openreview.net/forum?id=lgNx56yZh8a", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 8, + 6 + ], + "rating": "6.93", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Few-shot classification (FSC), the task of adapting a classifier to unseen classes given a small labeled dataset, is an important step on the path toward human-like machine learning. Bayesian methods are well-suited to tackling the fundamental issue of overfitting in the few-shot scenario because they allow practitioners to specify prior beliefs and update those beliefs in light of observed data. Contemporary approaches to Bayesian few-shot classification maintain a posterior distribution over model parameters, which is slow and requires storage that scales with model size. Instead, we propose a Gaussian process classifier based on a novel combination of P\u00f3lya-Gamma augmentation and the one-vs-each softmax approximation that allows us to efficiently marginalize over functions rather than model parameters. 
We demonstrate improved accuracy and uncertainty quantification on both standard few-shot classification benchmarks and few-shot domain transfer tasks.", + "title": "Bayesian Few-Shot Classification with One-vs-Each P\u00f3lya-Gamma Augmented Gaussian Processes", + "authors": [ + "Jake Snell", + "Richard Zemel" + ], + "emails": [ + "~Jake_Snell1", + "~Richard_Zemel1" + ], + "rank": 217 + }, + { + "url": "https://openreview.net/forum?id=bodgPrarPUJ", + "decision": "Reject", + "ratings": [ + 8, + 6, + 6, + 7 + ], + "rating": "6.92", + "confidences": [ + 4, + 3, + 2, + 4 + ], + "abstract": "This paper introduces new parameterizations of equilibrium neural networks, i.e.\nnetworks defined by implicit equations. This model class includes standard multilayer\nand residual networks as special cases. The new parameterization admits\na Lipschitz bound during training via unconstrained optimization, i.e. no projections\nor barrier functions are required. Lipschitz bounds are a common proxy for\nrobustness and appear in many generalization bounds. Furthermore, compared to\nprevious works we show well-posedness (existence of solutions) under less restrictive\nconditions on the network weights and more natural assumptions on the\nactivation functions: that they are monotone and slope restricted. These results\nare proved by establishing novel connections with convex optimization, operator\nsplitting on non-Euclidean spaces, and contracting neural ODEs. 
In image classification\nexperiments we show that the Lipschitz bounds are very accurate and\nimprove robustness to adversarial attacks.", + "title": "Lipschitz-Bounded Equilibrium Networks", + "authors": [ + "Max Revay", + "Ruigang Wang", + "Ian Manchester" + ], + "emails": [ + "~Ian_Manchester1", + "~Max_Revay1", + "~Ruigang_Wang2" + ], + "rank": 218 + }, + { + "url": "https://openreview.net/forum?id=cTQnZPLIohy", + "decision": "Reject", + "ratings": [ + 7, + 8, + 6 + ], + "rating": "6.92", + "confidences": [ + 4, + 4, + 5 + ], + "abstract": "Existing methods for incorporating symmetries into neural network architectures require prior knowledge of the symmetry group. We propose to learn the symmetries during the training of the group equivariant architectures. Our model, the Lie algebra convolutional network (L-conv), is based on infinitesimal generators of continuous groups and does not require discretization or integration over the group. We show that L-conv can approximate any group convolutional layer by composition of layers. We demonstrate how CNNs, Graph Convolutional Networks and fully-connected networks can all be expressed as an L-conv with appropriate groups. By allowing the infinitesimal generators to be learnable, L-conv can learn potential symmetries. We also show how the symmetries are related to the statistics of the dataset in linear settings. We find an analytical relationship between the symmetry group and a subgroup of an orthogonal group preserving the covariance of the input. Our experiments show that L-conv with trainable generators performs well on problems with hidden symmetries. 
Due to parameter sharing, L-conv also uses far fewer parameters than fully-connected layers.", + "title": "Lie Algebra Convolutional Neural Networks with Automatic Symmetry Extraction", + "authors": [ + "Nima Dehmamy", + "Yanchen Liu", + "Robin Walters", + "Rose Yu" + ], + "emails": [ + "~Rose_Yu1", + "gmail.com", + "~Robin_Walters1", + "~Nima_Dehmamy1" + ], + "rank": 219 + }, + { + "url": "https://openreview.net/forum?id=fgd7we_uZa6", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 8 + ], + "rating": "6.92", + "confidences": [ + 4, + 2, + 2, + 5 + ], + "abstract": "A recent line of research on deep learning focuses on the extremely over-parameterized setting, and shows that when the network width is larger than a high degree polynomial of the training sample size n and the inverse of the target error \u03f5\u22121, deep neural networks learned by (stochastic) gradient descent enjoy nice optimization and generalization guarantees. Very recently, it is shown that under certain margin assumptions on the training data, a polylogarithmic width condition suffices for two-layer ReLU networks to converge and generalize (Ji and Telgarsky, 2020). However, whether deep neural networks can be learned with such a mild over-parameterization is still an open question. In this work, we answer this question affirmatively and establish sharper learning guarantees for deep ReLU networks trained by (stochastic) gradient descent. In specific, under certain assumptions made in previous work, our optimization and generalization guarantees hold with network width polylogarithmic in n and \u03f5\u22121. 
Our results push the study of over-parameterized deep neural networks towards more practical settings.", + "title": "How Much Over-parameterization Is Sufficient to Learn Deep ReLU Networks?", + "authors": [ + "Zixiang Chen", + "Yuan Cao", + "Difan Zou", + "Quanquan Gu" + ], + "emails": [ + "~Quanquan_Gu1", + "~Yuan_Cao1", + "~Zixiang_Chen1", + "~Difan_Zou1" + ], + "rank": 220 + }, + { + "url": "https://openreview.net/forum?id=X4y_10OX-hX", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 8, + 7 + ], + "rating": "6.92", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "Dense Associative Memories or modern Hopfield networks permit storage and reliable retrieval of an exponentially large (in the dimension of feature space) number of memories. At the same time, their naive implementation is non-biological, since it seemingly requires the existence of many-body synaptic junctions between the neurons. We show that these models are effective descriptions of a more microscopic (written in terms of biological degrees of freedom) theory that has additional (hidden) neurons and only requires two-body interactions between them. For this reason our proposed microscopic theory is a valid model of large associative memory with a degree of biological plausibility. The dynamics of our network and its reduced dimensional equivalent both minimize energy (Lyapunov) functions. When certain dynamical variables (hidden neurons) are integrated out from our microscopic theory, one can recover many of the models that were previously discussed in the literature, e.g. the model presented in \"Hopfield Networks is All You Need\" paper. We also provide an alternative derivation of the energy function and the update rule proposed in the aforementioned paper and clarify the relationships between various models of this class.", + "title": "Large Associative Memory Problem in Neurobiology and Machine Learning", + "authors": [ + "Dmitry Krotov", + "John J. 
Hopfield" + ], + "emails": [ + "~Dmitry_Krotov2", + "~John_J._Hopfield1" + ], + "rank": 221 + }, + { + "url": "https://openreview.net/forum?id=TTUVg6vkNjK", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 7, + 6 + ], + "rating": "6.91", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "Role-based learning holds the promise of achieving scalable multi-agent learning by decomposing complex tasks using roles. However, it is largely unclear how to efficiently discover such a set of roles. To solve this problem, we propose to first decompose joint action spaces into restricted role action spaces by clustering actions according to their effects on the environment and other agents. Learning a role selector based on action effects makes role discovery much easier because it forms a bi-level learning hierarchy: the role selector searches in a smaller role space and at a lower temporal resolution, while role policies learn in significantly reduced primitive action-observation spaces. We further integrate information about action effects into the role policies to boost learning efficiency and policy generalization. By virtue of these advances, our method (1) outperforms the current state-of-the-art MARL algorithms on 9 of the 14 scenarios that comprise the challenging StarCraft II micromanagement benchmark and (2) achieves rapid transfer to new environments with three times the number of agents. 
Demonstrative videos can be viewed at https://sites.google.com/view/rode-marl.", + "title": "RODE: Learning Roles to Decompose Multi-Agent Tasks", + "authors": [ + "Tonghan Wang", + "Tarun Gupta", + "Anuj Mahajan", + "Bei Peng", + "Shimon Whiteson", + "Chongjie Zhang" + ], + "emails": [ + "~Shimon_Whiteson1", + "~Chongjie_Zhang1", + "~Tarun_Gupta3", + "~Anuj_Mahajan1", + "~Tonghan_Wang1", + "~Bei_Peng2" + ], + "rank": 222 + }, + { + "url": "https://openreview.net/forum?id=Pd_oMxH8IlF", + "decision": "Accept (Oral)", + "ratings": [ + 6, + 7, + 8 + ], + "rating": "6.90", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "Although neural module networks have an architectural bias towards compositionality, they require gold standard layouts to generalize systematically in practice. When instead learning layouts and modules jointly, compositionality does not arise automatically and an explicit pressure is necessary for the emergence of layouts exhibiting the right structure. We propose to address this problem using iterated learning, a cognitive science theory of the emergence of compositional languages in nature that has primarily been applied to simple referential games in machine learning. Considering the layouts of module networks as samples from an emergent language, we use iterated learning to encourage the development of structure within this language. We show that the resulting layouts support systematic generalization in neural agents solving the more complex task of visual question-answering. Our regularized iterated learning method can outperform baselines without iterated learning on SHAPES-SyGeT (SHAPES Systematic Generalization Test), a new split of the SHAPES dataset we introduce to evaluate systematic generalization, and on CLOSURE, an extension of CLEVR also designed to test systematic generalization. 
We demonstrate superior performance in recovering ground-truth compositional program structure with limited supervision on both SHAPES-SyGeT and CLEVR.", + "title": "Iterated learning for emergent systematicity in VQA", + "authors": [ + "Ankit Vani", + "Max Schwarzer", + "Yuchen Lu", + "Eeshan Dhekane", + "Aaron Courville" + ], + "emails": [ + "~Max_Schwarzer1", + "~Ankit_Vani1", + "gmail.com", + "~Yuchen_Lu1", + "~Aaron_Courville3" + ], + "rank": 223 + }, + { + "url": "https://openreview.net/forum?id=Ov_sMNau-PF", + "decision": "Accept (Poster)", + "ratings": [ + 9, + 5, + 6, + 7 + ], + "rating": "6.89", + "confidences": [ + 5, + 4, + 4, + 5 + ], + "abstract": "Extracting semantically useful natural language sentence representations from pre-trained deep neural networks such as Transformers remains a challenge. We first demonstrate that pre-training objectives impose a significant task bias onto the final layers of models with a layer-wise survey of the Semantic Textual Similarity (STS) correlations for multiple common Transformer language models. We then propose a new self-supervised method called Contrastive Tension (CT) to counter such biases. CT frames the training objective as a noise-contrastive task between the final layer representations of two independent models, in turn making the final layer representations suitable for feature extraction. Results from multiple common unsupervised and supervised STS tasks indicate that CT outperforms previous State Of The Art (SOTA), and when combining CT with supervised data we improve upon previous SOTA results with large margins. 
", + "title": "Semantic Re-tuning with Contrastive Tension", + "authors": [ + "Fredrik Carlsson", + "Amaru Cuba Gyllensten", + "Evangelia Gogoulou", + "Erik Ylip\u00e4\u00e4 Hellqvist", + "Magnus Sahlgren" + ], + "emails": [ + "~Evangelia_Gogoulou1", + "~Erik_Ylip\u00e4\u00e4_Hellqvist1", + "~Fredrik_Carlsson1", + "~Amaru_Cuba_Gyllensten2", + "~Magnus_Sahlgren1" + ], + "rank": 224 + }, + { + "url": "https://openreview.net/forum?id=AAes_3W-2z", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 8 + ], + "rating": "6.88", + "confidences": [ + 3, + 4, + 4, + 5 + ], + "abstract": "We present Wasserstein Embedding for Graph Learning (WEGL), a novel and fast framework for embedding entire graphs in a vector space, in which various machine learning models are applicable for graph-level prediction tasks. We leverage new insights on defining similarity between graphs as a function of the similarity between their node embedding distributions. Specifically, we use the Wasserstein distance to measure the dissimilarity between node embeddings of different graphs. Unlike prior work, we avoid pairwise calculation of distances between graphs and reduce the computational complexity from quadratic to linear in the number of graphs. WEGL calculates Monge maps from a reference distribution to each node embedding and, based on these maps, creates a fixed-sized vector representation of the graph. We evaluate our new graph embedding approach on various benchmark graph-property prediction tasks, showing state-of-the-art classification performance while having superior computational efficiency. The code is available at https://github.com/navid-naderi/WEGL.", + "title": "Wasserstein Embedding for Graph Learning", + "authors": [ + "Soheil Kolouri", + "Navid Naderializadeh", + "Gustavo K. 
Rohde", + "Heiko Hoffmann" + ], + "emails": [ + "hrl.com", + "~Soheil_Kolouri1", + "~Gustavo_K._Rohde1" + ], + "rank": 225 + }, + { + "url": "https://openreview.net/forum?id=tIjRAiFmU3y", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 8, + 7 + ], + "rating": "6.88", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "Designing an unsupervised image denoising approach in practical applications is a challenging task due to the complicated data acquisition process. In the real-world case, the noise distribution is so complex that the simplified additive white Gaussian (AWGN) assumption rarely holds, which significantly deteriorates the Gaussian denoisers' performance. To address this problem, we apply a deep neural network that maps the noisy image into a latent space in which the AWGN assumption holds, and thus any existing Gaussian denoiser is applicable. More specifically, the proposed neural network consists of the encoder-decoder structure and approximates the likelihood term in the Bayesian framework. Together with a Gaussian denoiser, the neural network can be trained with the input image itself and does not require any pre-training in other datasets. Extensive experiments on real-world noisy image datasets have shown that the combination of neural networks and Gaussian denoisers improves the performance of the original Gaussian denoisers by a large margin. 
In particular, the neural network+BM3D method significantly outperforms other unsupervised denoising approaches and is competitive with supervised networks such as DnCNN, FFDNet, and CBDNet.", + "title": "An Unsupervised Deep Learning Approach for Real-World Image Denoising", + "authors": [ + "Dihan Zheng", + "Sia Huat Tan", + "Xiaowen Zhang", + "Zuoqiang Shi", + "Kaisheng Ma", + "Chenglong Bao" + ], + "emails": [ + "~Chenglong_Bao3", + "~Dihan_Zheng1", + "~Kaisheng_Ma1", + "~Xiaowen_Zhang2", + "~Sia_Huat_Tan1", + "~Zuoqiang_Shi1" + ], + "rank": 226 + }, + { + "url": "https://openreview.net/forum?id=ADWd4TJO13G", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 6, + 9 + ], + "rating": "6.88", + "confidences": [ + 3, + 4, + 3, + 3, + 4 + ], + "abstract": "A hallmark of human intelligence is the ability to construct self-contained chunks of knowledge and adequately reuse them in novel combinations for solving different yet structurally related problems. Learning such compositional structures has been a significant challenge for artificial systems, due to the combinatorial nature of the underlying search problem. To date, research into compositional learning has largely proceeded separately from work on lifelong or continual learning. We integrate these two lines of work to present a general-purpose framework for lifelong learning of compositional structures that can be used for solving a stream of related tasks. Our framework separates the learning process into two broad stages: learning how to best combine existing components in order to assimilate a novel problem, and learning how to adapt the set of existing components to accommodate the new problem. 
This separation explicitly handles the trade-off between the stability required to remember how to solve earlier tasks and the flexibility required to solve new tasks, as we show empirically in an extensive evaluation.", + "title": "Lifelong Learning of Compositional Structures", + "authors": [ + "Jorge A Mendez", + "ERIC EATON" + ], + "emails": [ + "~ERIC_EATON1", + "~Jorge_A_Mendez1" + ], + "rank": 227 + }, + { + "url": "https://openreview.net/forum?id=Jacdvfjicf7", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 5 + ], + "rating": "6.87", + "confidences": [ + 4, + 5, + 5, + 1 + ], + "abstract": "This paper aims to understand and improve the utility of the dropout operation from the perspective of game-theoretical interactions. We prove that dropout can suppress the strength of interactions between input variables of deep neural networks (DNNs). The theoretical proof is also verified by various experiments. Furthermore, we find that such interactions were strongly related to the over-fitting problem in deep learning. So, the utility of dropout can be regarded as decreasing interactions to alleviating the significance of over-fitting. Based on this understanding, we propose the interaction loss to further improve the utility of dropout. 
Experimental results on various DNNs and datasets have shown that the interaction loss can effectively improve the utility of dropout and boost the performance of DNNs.", + "title": "Interpreting and Boosting Dropout from a Game-Theoretic View", + "authors": [ + "Hao Zhang", + "Sen Li", + "YinChao Ma", + "Mingjie Li", + "Yichen Xie", + "Quanshi Zhang" + ], + "emails": [ + "~Hao_Zhang22", + "~YinChao_Ma1", + "~Quanshi_Zhang1", + "~Sen_Li2", + "~Yichen_Xie1", + "~Mingjie_Li3" + ], + "rank": 228 + }, + { + "url": "https://openreview.net/forum?id=mEdwVCRJuX4", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 7, + 9 + ], + "rating": "6.87", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Real-world large-scale datasets are heteroskedastic and imbalanced --- labels have varying levels of uncertainty and label distributions are long-tailed. Heteroskedasticity and imbalance challenge deep learning algorithms due to the difficulty of distinguishing among mislabeled, ambiguous, and rare examples. Addressing heteroskedasticity and imbalance simultaneously is under-explored. We propose a data-dependent regularization technique for heteroskedastic datasets that regularizes different regions of the input space differently. Inspired by the theoretical derivation of the optimal regularization strength in a one-dimensional nonparametric classification setting, our approach adaptively regularizes the data points in higher-uncertainty, lower-density regions more heavily. We test our method on several benchmark tasks, including a real-world heteroskedastic and imbalanced dataset, WebVision. Our experiments corroborate our theory and demonstrate a significant improvement over other methods in noise-robust deep learning. 
", + "title": "Heteroskedastic and Imbalanced Deep Learning with Adaptive Regularization", + "authors": [ + "Kaidi Cao", + "Yining Chen", + "Junwei Lu", + "Nikos Arechiga", + "Adrien Gaidon", + "Tengyu Ma" + ], + "emails": [ + "~Kaidi_Cao1", + "~Adrien_Gaidon1", + "tri.global", + "~Junwei_Lu1", + "~Yining_Chen1", + "~Tengyu_Ma1" + ], + "rank": 229 + }, + { + "url": "https://openreview.net/forum?id=R2ZlTVPx0Gk", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 8 + ], + "rating": "6.86", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Deep ensembles perform better than a single network thanks to the diversity among their members. Recent approaches regularize predictions to increase diversity; however, they also drastically decrease individual members\u2019 performances. In this paper, we argue that learning strategies for deep ensembles need to tackle the trade-off between ensemble diversity and individual accuracies. Motivated by arguments from information theory and leveraging recent advances in neural estimation of conditional mutual information, we introduce a novel training criterion called DICE: it increases diversity by reducing spurious correlations among features. The main idea is that features extracted from pairs of members should only share information useful for target class prediction without being conditionally redundant. Therefore, besides the classification loss with information bottleneck, we adversarially prevent features from being conditionally predictable from each other. We manage to reduce simultaneous errors while protecting class information. We obtain state-of-the-art accuracy results on CIFAR-10/100: for example, an ensemble of 5 networks trained with DICE matches an ensemble of 7 networks trained independently. 
We further analyze the consequences on calibration, uncertainty estimation, out-of-distribution detection and online co-distillation.", + "title": "DICE: Diversity in Deep Ensembles via Conditional Redundancy Adversarial Estimation", + "authors": [ + "Alexandre Rame", + "Matthieu Cord" + ], + "emails": [ + "~Alexandre_Rame1", + "~Matthieu_Cord1" + ], + "rank": 230 + }, + { + "url": "https://openreview.net/forum?id=F1vEjWK-lH_", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 6, + 7, + 6 + ], + "rating": "6.86", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "Massively multilingual models subsuming tens or even hundreds of languages pose great challenges to multi-task optimization. While it is a common practice to apply a language-agnostic procedure optimizing a joint multilingual task objective, how to properly characterize and take advantage of its underlying problem structure for improving optimization efficiency remains under-explored. In this paper, we attempt to peek into the black-box of multilingual optimization through the lens of loss function geometry. We find that gradient similarity measured along the optimization trajectory is an important signal, which correlates well with not only language proximity but also the overall model performance. Such observation helps us to identify a critical limitation of existing gradient-based multi-task learning methods, and thus we derive a simple and scalable optimization procedure, named Gradient Vaccine, which encourages more geometrically aligned parameter updates for close tasks. Empirically, our method obtains significant model performance gains on multilingual machine translation and XTREME benchmark tasks for multilingual language models. 
Our work reveals the importance of properly measuring and utilizing language proximity in multilingual optimization, and has broader implications for multi-task learning beyond multilingual modeling.", + "title": "Gradient Vaccine: Investigating and Improving Multi-task Optimization in Massively Multilingual Models", + "authors": [ + "Zirui Wang", + "Yulia Tsvetkov", + "Orhan Firat", + "Yuan Cao" + ], + "emails": [ + "~Orhan_Firat1", + "~Yulia_Tsvetkov1", + "~Yuan_Cao2", + "~Zirui_Wang1" + ], + "rank": 231 + }, + { + "url": "https://openreview.net/forum?id=N0M_4BkQ05i", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 8, + 7 + ], + "rating": "6.86", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "Selective classification, in which models can abstain on uncertain predictions, is a natural approach to improving accuracy in settings where errors are costly but abstentions are manageable. In this paper, we find that while selective classification can improve average accuracies, it can simultaneously magnify existing accuracy disparities between various groups within a population, especially in the presence of spurious correlations. We observe this behavior consistently across five vision and NLP datasets. Surprisingly, increasing abstentions can even decrease accuracies on some groups. To better understand this phenomenon, we study the margin distribution, which captures the model\u2019s confidences over all predictions. For symmetric margin distributions, we prove that whether selective classification monotonically improves or worsens accuracy is fully determined by the accuracy at full coverage (i.e., without any abstentions) and whether the distribution satisfies a property we call left-log-concavity. Our analysis also shows that selective classification tends to magnify full-coverage accuracy disparities. 
Motivated by our analysis, we train distributionally-robust models that achieve similar full-coverage accuracies across groups and show that selective classification uniformly improves each group on these models. Altogether, our results suggest that selective classification should be used with care and underscore the importance of training models to perform equally well across groups at full coverage.", + "title": "Selective Classification Can Magnify Disparities Across Groups", + "authors": [ + "Erik Jones", + "Shiori Sagawa", + "Pang Wei Koh", + "Ananya Kumar", + "Percy Liang" + ], + "emails": [ + "~Percy_Liang1", + "~Pang_Wei_Koh1", + "~Ananya_Kumar1", + "~Shiori_Sagawa1", + "~Erik_Jones3" + ], + "rank": 232 + }, + { + "url": "https://openreview.net/forum?id=vK9WrZ0QYQ", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 7, + 8 + ], + "rating": "6.86", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "We prove that the reproducing kernel Hilbert spaces (RKHS) of a deep neural tangent kernel and the Laplace kernel include the same set of functions, when both kernels are restricted to the sphere Sd\u22121. Additionally, we prove that the exponential power kernel with a smaller power (making the kernel less smooth) leads to a larger RKHS, when it is restricted to the sphere Sd\u22121 and when it is defined on the entire Rd.", + "title": "Deep Neural Tangent Kernel and Laplace Kernel Have the Same RKHS", + "authors": [ + "Lin Chen", + "Sheng Xu" + ], + "emails": [ + "~Sheng_Xu5", + "~Lin_Chen14" + ], + "rank": 233 + }, + { + "url": "https://openreview.net/forum?id=ULQdiUTHe3y", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 6, + 8 + ], + "rating": "6.86", + "confidences": [ + 1, + 3, + 1, + 2 + ], + "abstract": "In tasks like node classification, image segmentation, and named-entity recognition we have a classifier that simultaneously outputs multiple predictions (a vector of labels) based on a single input, i.e. 
a single graph, image, or document respectively. Existing adversarial robustness certificates consider each prediction independently and are thus overly pessimistic for such tasks. They implicitly assume that an adversary can use different perturbed inputs to attack different predictions, ignoring the fact that we have a single shared input. We propose the first collective robustness certificate which computes the number of predictions that are simultaneously guaranteed to remain stable under perturbation, i.e. cannot be attacked. We focus on Graph Neural Networks and leverage their locality property - perturbations only affect the predictions in a close neighborhood - to fuse multiple single-node certificates into a drastically stronger collective certificate. For example, on the Citeseer dataset our collective certificate for node classification increases the average number of certifiable feature perturbations from 7 to 351.\n", + "title": "Collective Robustness Certificates: Exploiting Interdependence in Graph Neural Networks", + "authors": [ + "Jan Schuchardt", + "Aleksandar Bojchevski", + "Johannes Klicpera", + "Stephan G\u00fcnnemann" + ], + "emails": [ + "~Aleksandar_Bojchevski1", + "~Johannes_Klicpera1", + "~Stephan_G\u00fcnnemann1", + "~Jan_Schuchardt1" + ], + "rank": 234 + }, + { + "url": "https://openreview.net/forum?id=QHUUrieaqai", + "decision": "Reject", + "ratings": [ + 6, + 7, + 8, + 6 + ], + "rating": "6.86", + "confidences": [ + 4, + 4, + 4, + 2 + ], + "abstract": "While designing inductive bias in neural architectures has been widely studied, we hypothesize that transformer networks are flexible enough to learn inductive bias from suitable generic tasks. Here, we replace architecture engineering by encoding inductive bias in the form of datasets. 
Inspired by Peirce's view that deduction, induction, and abduction form an irreducible set of reasoning primitives, we design three synthetic tasks that are intended to require the model to have these three abilities. We specifically design these synthetic tasks in a way that they are devoid of mathematical knowledge to ensure that only the fundamental reasoning biases can be learned from these tasks. This defines a new pre-training methodology called ``\"LIME\" (Learning Inductive bias for Mathematical rEasoning). Models trained with LIME significantly outperform vanilla transformers on three very different large mathematical reasoning benchmarks. Unlike dominating the computation cost as traditional pre-training approaches, LIME requires only a small fraction of the computation cost of the typical downstream task.", + "title": "LIME: Learning Inductive Bias for Primitives of Mathematical Reasoning", + "authors": [ + "Yuhuai Wu", + "Markus Norman Rabe", + "Wenda Li", + "Jimmy Ba", + "Roger Baker Grosse", + "Christian Szegedy" + ], + "emails": [ + "~Markus_Norman_Rabe1", + "~Wenda_Li1", + "~Yuhuai_Wu1", + "~Christian_Szegedy1", + "~Roger_Baker_Grosse1", + "~Jimmy_Ba1" + ], + "rank": 235 + }, + { + "url": "https://openreview.net/forum?id=jrA5GAccy_", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 6 + ], + "rating": "6.85", + "confidences": [ + 4, + 3, + 4, + 2 + ], + "abstract": "Recently, invariant risk minimization (IRM) was proposed as a promising solution to address out-of-distribution (OOD) generalization. However, it is unclear when IRM should be preferred over the widely-employed empirical risk minimization (ERM) framework. In this work, we analyze both these frameworks from the perspective of sample complexity, thus taking a firm step towards answering this important question. We find that depending on the type of data generation mechanism, the two approaches might have very different finite sample and asymptotic behavior. 
For example, in the covariate shift setting we see that the two approaches not only arrive at the same asymptotic solution, but also have similar finite sample behavior with no clear winner. For other distribution shifts such as those involving confounders or anti-causal variables, however, the two approaches arrive at different asymptotic solutions where IRM is guaranteed to be close to the desired OOD solutions in the finite sample regime, while ERM is biased even asymptotically. We further investigate how different factors --- the number of environments, complexity of the model, and IRM penalty weight --- impact the sample complexity of IRM in relation to its distance from the OOD solutions. ", + "title": "Empirical or Invariant Risk Minimization? A Sample Complexity Perspective", + "authors": [ + "Kartik Ahuja", + "Jun Wang", + "Amit Dhurandhar", + "Karthikeyan Shanmugam", + "Kush R. Varshney" + ], + "emails": [ + "~Kartik_Ahuja1", + "~Karthikeyan_Shanmugam1", + "~Kush_R._Varshney1", + "~Amit_Dhurandhar1", + "gmail.com" + ], + "rank": 236 + }, + { + "url": "https://openreview.net/forum?id=8Ln-Bq0mZcy", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 7 + ], + "rating": "6.85", + "confidences": [ + 2, + 4, + 4, + 3 + ], + "abstract": "Humans can quickly adapt to new partners in collaborative tasks (e.g. playing basketball), because they understand which fundamental skills of the task (e.g. how to dribble, how to shoot) carry over across new partners. Humans can also quickly adapt to similar tasks with the same partners by carrying over conventions that they have developed (e.g. raising hand signals pass the ball), without learning to coordinate from scratch. To collaborate seamlessly with humans, AI agents should adapt quickly to new partners and new tasks as well. 
However, current approaches have not attempted to distinguish between the complexities intrinsic to a task and the conventions used by a partner, and more generally there has been little focus on leveraging conventions for adapting to new settings. In this work, we propose a learning framework that teases apart rule-dependent representation from convention-dependent representation in a principled way. We show that, under some assumptions, our rule-dependent representation is a sufficient statistic of the distribution over best-response strategies across partners. Using this separation of representations, our agents are able to adapt quickly to new partners, and to coordinate with old partners on new tasks in a zero-shot manner. We experimentally validate our approach on three collaborative tasks varying in complexity: a contextual multi-armed bandit, a block placing task, and the card game Hanabi.", + "title": "On the Critical Role of Conventions in Adaptive Human-AI Collaboration", + "authors": [ + "Andy Shih", + "Arjun Sawhney", + "Jovana Kondic", + "Stefano Ermon", + "Dorsa Sadigh" + ], + "emails": [ + "~Arjun_Sawhney1", + "~Andy_Shih1", + "~Jovana_Kondic1", + "~Dorsa_Sadigh1", + "~Stefano_Ermon1" + ], + "rank": 237 + }, + { + "url": "https://openreview.net/forum?id=fmOOI2a3tQP", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 7 + ], + "rating": "6.83", + "confidences": [ + 3, + 4, + 2, + 3 + ], + "abstract": "Many control tasks exhibit similar dynamics that can be modeled as having common latent structure. Hidden-Parameter Markov Decision Processes (HiP-MDPs) explicitly model this structure to improve sample efficiency in multi-task settings.\nHowever, this setting makes strong assumptions on the observability of the state that limit its application in real-world scenarios with rich observation spaces. 
In this work, we leverage ideas of common structure from the HiP-MDP setting, and extend it to enable robust state abstractions inspired by Block MDPs. We derive instantiations of this new framework for both multi-task reinforcement learning (MTRL) and meta-reinforcement learning (Meta-RL) settings. Further, we provide transfer and generalization bounds based on task and state similarity, along with sample complexity bounds that depend on the aggregate number of samples across tasks, rather than the number of tasks, a significant improvement over prior work. To further demonstrate efficacy of the proposed method, we empirically compare and show improvement over multi-task and meta-reinforcement learning baselines.", + "title": "Learning Robust State Abstractions for Hidden-Parameter Block MDPs", + "authors": [ + "Amy Zhang", + "Shagun Sodhani", + "Khimya Khetarpal", + "Joelle Pineau" + ], + "emails": [ + "~Khimya_Khetarpal1", + "~Joelle_Pineau1", + "~Shagun_Sodhani1", + "~Amy_Zhang1" + ], + "rank": 238 + }, + { + "url": "https://openreview.net/forum?id=04cII6MumYV", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7, + 8, + 6 + ], + "rating": "6.83", + "confidences": [ + 5, + 5, + 5, + 5, + 4 + ], + "abstract": "Few-shot classification aims to recognize unseen classes when presented with only a small number of samples. We consider the problem of multi-domain few-shot image classification, where unseen classes and examples come from diverse data sources. This problem has seen growing interest and has inspired the development of benchmarks such as Meta-Dataset. A key challenge in this multi-domain setting is to effectively integrate the feature representations from the diverse set of training domains. Here, we propose a Universal Representation Transformer (URT) layer, that meta-learns to leverage universal features for few-shot classification by dynamically re-weighting and composing the most appropriate domain-specific representations. 
In experiments, we show that URT sets a new state-of-the-art result on Meta-Dataset. Specifically, it achieves top-performance on the highest number of data sources compared to competing methods. We analyze variants of URT and present a visualization of the attention score heatmaps that sheds light on how the model performs cross-domain generalization.", + "title": "A Universal Representation Transformer Layer for Few-Shot Image Classification", + "authors": [ + "Lu Liu", + "William L. Hamilton", + "Guodong Long", + "Jing Jiang", + "Hugo Larochelle" + ], + "emails": [ + "~Hugo_Larochelle1", + "~Guodong_Long2", + "~Lu_Liu7", + "~Jing_Jiang6", + "~William_L._Hamilton1" + ], + "rank": 239 + }, + { + "url": "https://openreview.net/forum?id=O6LPudowNQm", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 7, + 6, + 6 + ], + "rating": "6.83", + "confidences": [ + 4, + 2, + 4, + 2 + ], + "abstract": "In learning-assisted theorem proving, one of the most critical challenges is to generalize to theorems unlike those seen at training time. In this paper, we introduce INT, an INequality Theorem proving benchmark designed to test agents\u2019 generalization ability. INT is based on a theorem generator, which provides theoretically infinite data and allows us to measure 6 different types of generalization, each reflecting a distinct challenge, characteristic of automated theorem proving. In addition, provides a fast theorem proving environment with sequence-based and graph-based interfaces, conducive to performing learning-based research. We introduce base-lines with architectures including transformers and graph neural networks (GNNs)for INT. Using INT, we find that transformer-based agents achieve stronger test performance for most of the generalization tasks, despite having much larger out-of-distribution generalization gaps than GNNs. 
We further find that the addition of Monte Carlo Tree Search (MCTS) at test time helps to prove new theorems.", + "title": "INT: An Inequality Benchmark for Evaluating Generalization in Theorem Proving", + "authors": [ + "Yuhuai Wu", + "Albert Jiang", + "Jimmy Ba", + "Roger Baker Grosse" + ], + "emails": [ + "~Roger_Baker_Grosse1", + "~Jimmy_Ba1", + "~Albert_Jiang1", + "~Yuhuai_Wu1" + ], + "rank": 240 + }, + { + "url": "https://openreview.net/forum?id=JkfYjnOEo6M", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 8, + 6 + ], + "rating": "6.82", + "confidences": [ + 4, + 3, + 5, + 5 + ], + "abstract": "We provide a general self-attention formulation to impose group equivariance to arbitrary symmetry groups. This is achieved by defining positional encodings that are invariant to the action of the group considered. Since the group acts on the positional encoding directly, group equivariant self-attention networks (GSA-Nets) are steerable by nature. Our experiments on vision benchmarks demonstrate consistent improvements of GSA-Nets over non-equivariant self-attention networks.", + "title": "Group Equivariant Stand-Alone Self-Attention For Vision", + "authors": [ + "David W. Romero", + "Jean-Baptiste Cordonnier" + ], + "emails": [ + "~Jean-Baptiste_Cordonnier2", + "~David_W._Romero1" + ], + "rank": 241 + }, + { + "url": "https://openreview.net/forum?id=jh-rTtvkGeM", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 5, + 8, + 8 + ], + "rating": "6.82", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "We empirically demonstrate that full-batch gradient descent on neural network training objectives typically operates in a regime we call the Edge of Stability. In this regime, the maximum eigenvalue of the training loss Hessian hovers just above the value 2/(step size), and the training loss behaves non-monotonically over short timescales, yet consistently decreases over long timescales. 
Since this behavior is inconsistent with several widespread presumptions in the field of optimization, our findings raise questions as to whether these presumptions are relevant to neural network training. We hope that our findings will inspire future efforts aimed at rigorously understanding optimization at the Edge of Stability.", + "title": "Gradient Descent on Neural Networks Typically Occurs at the Edge of Stability", + "authors": [ + "Jeremy Cohen", + "Simran Kaur", + "Yuanzhi Li", + "J Zico Kolter", + "Ameet Talwalkar" + ], + "emails": [ + "andrew.cmu.edu", + "~Yuanzhi_Li1", + "~Jeremy_Cohen1", + "~Ameet_Talwalkar1", + "~J_Zico_Kolter1" + ], + "rank": 242 + }, + { + "url": "https://openreview.net/forum?id=HgLO8yalfwc", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 8, + 6, + 7, + 6 + ], + "rating": "6.82", + "confidences": [ + 3, + 4, + 3, + 3, + 4 + ], + "abstract": "Inverse Reinforcement Learning (IRL) aims to facilitate a learner\u2019s ability to imitate expert behavior by acquiring reward functions that explain the expert\u2019s decisions. Regularized IRLapplies strongly convex regularizers to the learner\u2019s policy in order to avoid the expert\u2019s behavior being rationalized by arbitrary constant rewards, also known as degenerate solutions. We propose tractable solutions, and practical methods to obtain them, for regularized IRL. Current methods are restricted to the maximum-entropy IRL framework, limiting them to Shannon-entropy regularizers, as well as proposing solutions that are intractable in practice. 
We present theoretical backing for our proposed IRL method\u2019s applicability to both discrete and continuous controls, empirically validating our performance on a variety of tasks.", + "title": "Regularized Inverse Reinforcement Learning", + "authors": [ + "Wonseok Jeon", + "Chen-Yang Su", + "Paul Barde", + "Thang Doan", + "Derek Nowrouzezahrai", + "Joelle Pineau" + ], + "emails": [ + "~Derek_Nowrouzezahrai1", + "~Thang_Doan1", + "~Paul_Barde1", + "~Joelle_Pineau1", + "~Wonseok_Jeon1", + "~Chen-Yang_Su1" + ], + "rank": 243 + }, + { + "url": "https://openreview.net/forum?id=blfSjHeFM_e", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 7 + ], + "rating": "6.82", + "confidences": [ + 2, + 3, + 2, + 4 + ], + "abstract": "Neural ordinary differential equations (Neural ODEs) are a new family of deep-learning models with continuous depth. However, the numerical estimation of the gradient in the continuous case is not well solved: existing implementations of the adjoint method suffer from inaccuracy in reverse-time trajectory, while the naive method and the adaptive checkpoint adjoint method (ACA) have a memory cost that grows with integration time. In this project, based on the asynchronous leapfrog (ALF) solver, we propose the Memory-efficient ALF Integrator (MALI), which has a constant memory cost w.r.t integration time similar to the adjoint method, and guarantees accuracy in reverse-time trajectory (hence accuracy in gradient estimation). We validate MALI in various tasks: on image recognition tasks, to our knowledge, MALI is the first to enable feasible training of a Neural ODE on ImageNet and outperform a well-tuned ResNet, while existing methods fail due to either heavy memory burden or inaccuracy; for time series modeling, MALI significantly outperforms the adjoint method; and for continuous generative models, MALI achieves new state-of-the-art performance. 
We provide a pypi package: https://jzkay12.github.io/TorchDiffEqPack", + "title": "MALI: A memory efficient and reverse accurate integrator for Neural ODEs", + "authors": [ + "Juntang Zhuang", + "Nicha C Dvornek", + "sekhar tatikonda", + "James s Duncan" + ], + "emails": [ + "~Nicha_C_Dvornek1", + "~James_s_Duncan1", + "~Juntang_Zhuang1", + "~sekhar_tatikonda1" + ], + "rank": 244 + }, + { + "url": "https://openreview.net/forum?id=42kiJ7n_8xO", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 8, + 5 + ], + "rating": "6.81", + "confidences": [ + 4, + 4, + 5, + 4, + 4 + ], + "abstract": "Despite the widespread application of recurrent neural networks (RNNs), a unified understanding of how RNNs solve particular tasks remains elusive. In particular, it is unclear what dynamical patterns arise in trained RNNs, and how those pat-terns depend on the training dataset or task. This work addresses these questions in the context of text classification, building on earlier work studying the dynamics of binary sentiment-classification networks (Maheswaranathan et al., 2019). We study text-classification tasks beyond the binary case, exploring the dynamics ofRNNs trained on both natural and synthetic datasets. These dynamics, which we find to be both interpretable and low-dimensional, share a common mechanism across architectures and datasets: specifically, these text-classification networks use low-dimensional attractor manifolds to accumulate evidence for each class as they process the text. The dimensionality and geometry of the attractor manifold are determined by the structure of the training dataset, with the dimensionality reflecting the number of scalar quantities the network remembers in order to classify.In categorical classification, for example, we show that this dimensionality is one less than the number of classes. 
Correlations in the dataset, such as those induced by ordering, can further reduce the dimensionality of the attractor manifold; we show how to predict this reduction using simple word-count statistics computed on the training dataset. To the degree that integration of evidence towards a decision is a common computational primitive, this work continues to lay the foundation for using dynamical systems techniques to study the inner workings of RNNs.", + "title": "The geometry of integration in text classification RNNs", + "authors": [ + "Kyle Aitken", + "Vinay Venkatesh Ramasesh", + "Ankush Garg", + "Yuan Cao", + "David Sussillo", + "Niru Maheswaranathan" + ], + "emails": [ + "~Kyle_Aitken1", + "~Ankush_Garg1", + "~Yuan_Cao2", + "~Niru_Maheswaranathan1", + "~David_Sussillo1", + "~Vinay_Venkatesh_Ramasesh1" + ], + "rank": 245 + }, + { + "url": "https://openreview.net/forum?id=EMHoBG0avc1", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 6, + 9 + ], + "rating": "6.81", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "We propose a simple and efficient multi-hop dense retrieval approach for answering complex open-domain questions, which achieves state-of-the-art performance on two multi-hop datasets, HotpotQA and multi-evidence FEVER. Contrary to previous work, our method does not require access to any corpus-specific information, such as inter-document hyperlinks or human-annotated entity markers, and can be applied to any unstructured text corpus. 
Our system also yields a much better efficiency-accuracy trade-off, matching the best published accuracy on HotpotQA while being 10 times faster at inference time.", + "title": "Answering Complex Open-Domain Questions with Multi-Hop Dense Retrieval", + "authors": [ + "Wenhan Xiong", + "Xiang Li", + "Srini Iyer", + "Jingfei Du", + "Patrick Lewis", + "William Yang Wang", + "Yashar Mehdad", + "Scott Yih", + "Sebastian Riedel", + "Douwe Kiela", + "Barlas Oguz" + ], + "emails": [ + "~Xiang_Li2", + "~Jingfei_Du1", + "~Wenhan_Xiong1", + "~Douwe_Kiela1", + "~Barlas_Oguz1", + "~Patrick_Lewis2", + "~William_Yang_Wang2", + "~Sebastian_Riedel1", + "~Scott_Yih1", + "fb.com", + "~Srini_Iyer1" + ], + "rank": 246 + }, + { + "url": "https://openreview.net/forum?id=Zbc-ue9p_rE", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 7, + 7 + ], + "rating": "6.81", + "confidences": [ + 3, + 4, + 4, + 3, + 2 + ], + "abstract": "Deep generative modeling has seen impressive advances in recent years, to the point where it is now commonplace to see simulated samples (e.g., images) that closely resemble real-world data. However, generation quality is generally inconsistent for any given model and can vary dramatically between samples. We introduce Discriminator Gradient flow (DGflow), a new technique that improves generated samples via the gradient flow of entropy-regularized f-divergences between the real and the generated data distributions. The gradient flow takes the form of a non-linear Fokker-Plank equation, which can be easily simulated by sampling from the equivalent McKean-Vlasov process. By refining inferior samples, our technique avoids wasteful sample rejection used by previous methods (DRS & MH-GAN). Compared to existing works that focus on specific GAN variants, we show our refinement approach can be applied to GANs with vector-valued critics and even other deep generative models such as VAEs and Normalizing Flows. 
Empirical results on multiple synthetic, image, and text datasets demonstrate that DGflow leads to significant improvement in the quality of generated samples for a variety of generative models, outperforming the state-of-the-art Discriminator Optimal Transport (DOT) and Discriminator Driven Latent Sampling (DDLS) methods.", + "title": "Refining Deep Generative Models via Discriminator Gradient Flow", + "authors": [ + "Abdul Fatir Ansari", + "Ming Liang Ang", + "Harold Soh" + ], + "emails": [ + "~Harold_Soh1", + "~Abdul_Fatir_Ansari2", + "~Ming_Liang_Ang4" + ], + "rank": 247 + }, + { + "url": "https://openreview.net/forum?id=MaZFq7bJif7", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 8 + ], + "rating": "6.80", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "This paper considers the problem of spatiotemporal object-centric reasoning in videos. Central to our approach is the notion of object permanence, i.e., the ability to reason about the location of objects as they move through the video while being occluded, contained or carried by other objects. Existing deep learning based approaches often suffer from spatiotemporal biases when applied to video reasoning problems. We propose Hopper, which uses a Multi-hop Transformer for reasoning object permanence in videos. Given a video and a localization query, Hopper reasons over image and object tracks to automatically hop over critical frames in an iterative fashion to predict the final position of the object of interest. We demonstrate the effectiveness of using a contrastive loss to reduce spatiotemporal biases. We evaluate over CATER dataset and find that Hopper achieves 73.2% Top-1 accuracy using just 1 FPS by hopping through just a few critical frames. 
We also demonstrate Hopper can perform long-term reasoning by building a CATER-h dataset that requires multi-step reasoning to localize objects of interest correctly.", + "title": "Hopper: Multi-hop Transformer for Spatiotemporal Reasoning", + "authors": [ + "Honglu Zhou", + "Asim Kadav", + "Farley Lai", + "Alexandru Niculescu-Mizil", + "Martin Renqiang Min", + "Mubbasir Kapadia", + "Hans Peter Graf" + ], + "emails": [ + "~Hans_Peter_Graf1", + "~Martin_Renqiang_Min1", + "~Alexandru_Niculescu-Mizil1", + "~Mubbasir_Kapadia2", + "~Honglu_Zhou1", + "~Asim_Kadav1", + "~Farley_Lai1" + ], + "rank": 248 + }, + { + "url": "https://openreview.net/forum?id=_0kaDkv3dVf", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 6, + 7 + ], + "rating": "6.80", + "confidences": [ + 3, + 4, + 3, + 5 + ], + "abstract": "HardWare-aware Neural Architecture Search (HW-NAS) has recently gained tremendous attention by automating the design of deep neural networks deployed in more resource-constrained daily life devices. Despite its promising performance, developing optimal HW-NAS solutions can be prohibitively challenging as it requires cross-disciplinary knowledge in the algorithm, micro-architecture, and device-specific compilation. First, to determine the hardware-cost to be incorporated into the NAS process, existing works mostly adopt either pre-collected hardware-cost look-up tables or device-specific hardware-cost models. The former can be time-consuming due to the required knowledge of the device\u2019s compilation method and how to set up the measurement pipeline, while building the latter is often a barrier for non-hardware experts like NAS researchers. Both of them limit the development of HW-NAS innovations and impose a barrier-to-entry to non-hardware experts. 
Second, similar to generic NAS, it can be notoriously difficult to benchmark HW-NAS algorithms due to their significant required computational resources and the differences in adopted search spaces, hyperparameters, and hardware devices. To this end, we develop HW-NAS-Bench, the first public dataset for HW-NAS research which aims to democratize HW-NAS research to non-hardware experts and make HW-NAS research more reproducible and accessible. To design HW-NAS-Bench, we carefully collected the measured/estimated hardware performance (e.g., energy cost and latency) of all the networks in the search spaces of both NAS-Bench-201 and FBNet, on six hardware devices that fall into three categories (i.e., commercial edge devices, FPGA, and ASIC). Furthermore, we provide a comprehensive analysis of the collected measurements in HW-NAS-Bench to provide insights for HW-NAS research. Finally, we demonstrate exemplary user cases to (1) show that HW-NAS-Bench allows non-hardware experts to perform HW-NAS by simply querying our pre-measured dataset and (2) verify that dedicated device-specific HW-NAS can indeed lead to optimal accuracy-cost trade-offs. 
The codes and all collected data are available at https://github.com/RICE-EIC/HW-NAS-Bench.", + "title": "HW-NAS-Bench: Hardware-Aware Neural Architecture Search Benchmark", + "authors": [ + "Chaojian Li", + "Zhongzhi Yu", + "Yonggan Fu", + "Yongan Zhang", + "Yang Zhao", + "Haoran You", + "Qixuan Yu", + "Yue Wang", + "Cong Hao", + "Yingyan Lin" + ], + "emails": [ + "~Yingyan_Lin1", + "~Yue_Wang3", + "~Qixuan_Yu1", + "~Yonggan_Fu1", + "~Haoran_You1", + "~Zhongzhi_Yu1", + "gmail.com", + "~Chaojian_Li1", + "~Yang_Zhao1", + "~Yongan_Zhang1" + ], + "rank": 249 + }, + { + "url": "https://openreview.net/forum?id=fmtSg8591Q", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 7 + ], + "rating": "6.80", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Reinforcement learning (RL) in episodic, factored Markov decision processes (FMDPs) is studied. We propose an algorithm called FMDP-BF, which leverages the factorization structure of FMDP. The regret of FMDP-BF is shown to be exponentially smaller than that of optimal algorithms designed for non-factored MDPs, and improves on the best previous result for FMDPs~\\citep{osband2014near} by a factor of nH|Si|, where |Si| is the cardinality of the factored state subspace, H is the planning horizon and n is the number of factored transition. To show the optimality of our bounds, we also provide a lower bound for FMDP, which indicates that our algorithm is near-optimal w.r.t. timestep T, horizon H and factored state-action subspace cardinality. 
Finally, as an application, we study a new formulation of constrained RL, known as RL with knapsack constraints (RLwK), and provides the first sample-efficient algorithm based on FMDP-BF.", + "title": "Efficient Reinforcement Learning in Factored MDPs with Application to Constrained RL", + "authors": [ + "Xiaoyu Chen", + "Jiachen Hu", + "Lihong Li", + "Liwei Wang" + ], + "emails": [ + "~Jiachen_Hu1", + "~Liwei_Wang1", + "~Xiaoyu_Chen2", + "~Lihong_Li1" + ], + "rank": 250 + }, + { + "url": "https://openreview.net/forum?id=ebS5NUfoMKL", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 9, + 5 + ], + "rating": "6.80", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Graph neural networks (GNNs) are powerful models that have been successful in various graph representation learning tasks. Whereas gradient boosted decision trees (GBDT) often outperform other machine learning methods when faced with heterogeneous tabular data. But what approach should be used for graphs with tabular node features? Previous GNN models have mostly focused on networks with homogeneous sparse features and, as we show, are suboptimal in the heterogeneous setting. In this work, we propose a novel architecture that trains GBDT and GNN jointly to get the best of both worlds: the GBDT model deals with heterogeneous features, while GNN accounts for the graph structure. Our model benefits from end-to-end optimization by allowing new trees to fit the gradient updates of GNN. With an extensive experimental comparison to the leading GBDT and GNN models, we demonstrate a significant increase in performance on a variety of graphs with tabular features. 
The code is available: https://github.com/nd7141/bgnn.", + "title": "Boost then Convolve: Gradient Boosting Meets Graph Neural Networks", + "authors": [ + "Sergei Ivanov", + "Liudmila Prokhorenkova" + ], + "emails": [ + "~Sergei_Ivanov2", + "~Liudmila_Prokhorenkova1" + ], + "rank": 251 + }, + { + "url": "https://openreview.net/forum?id=kPheYCFm0Od", + "decision": "Reject", + "ratings": [ + 7, + 7, + 5, + 8 + ], + "rating": "6.80", + "confidences": [ + 3, + 3, + 4, + 5 + ], + "abstract": "Multi-task learning aims to improve the overall performance of a set of tasks by leveraging their relatedness. When training data is limited using priors is pivotal, but currently this is done in ad-hoc ways. In this paper, we develop variational multi-task learning - VMTL, a general probabilistic inference framework for simultaneously learning multiple related tasks. We cast multi-task learning as a variational Bayesian inference problem, which enables task relatedness to be explored in a principled way by specifying priors. We introduce Gumbel-softmax priors to condition the prior of each task on related tasks. Each prior is represented as a mixture of variational posteriors of other related tasks and the mixing weights are learned in a data-driven manner for each individual task. The posteriors over representations and classifiers are inferred jointly for all tasks and individual tasks are able to improve their performance by using the shared inductive bias. 
Experimental results demonstrate that VMTL is able to tackle challenging multi-task learning with limited training data well, and it achieves state-of-the-art performance on four benchmarks, consistently surpassing previous methods.", + "title": "Variational Multi-Task Learning", + "authors": [ + "Jiayi Shen", + "Xiantong Zhen", + "Marcel Worring", + "Ling Shao" + ], + "emails": [ + "~Xiantong_Zhen1", + "~Marcel_Worring1", + "~Jiayi_Shen3", + "~Ling_Shao1" + ], + "rank": 252 + }, + { + "url": "https://openreview.net/forum?id=4dXmpCDGNp7", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 7 + ], + "rating": "6.79", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Feature based explanations, that provide importance of each feature towards the model prediction, is arguably one of the most intuitive ways to explain a model. In this paper, we establish a novel set of evaluation criteria for such feature based explanations by robustness analysis. In contrast to existing evaluations which require us to specify some way to \"remove\" features that could inevitably introduces biases and artifacts, we make use of the subtler notion of smaller adversarial perturbations. By optimizing towards our proposed evaluation criteria, we obtain new explanations that are loosely necessary and sufficient for a prediction. We further extend the explanation to extract the set of features that would move the current prediction to a target class by adopting targeted adversarial attack for the robustness analysis. 
Through experiments across multiple domains and a user study, we validate the usefulness of our evaluation criteria and our derived explanations.", + "title": "Evaluations and Methods for Explanation through Robustness Analysis", + "authors": [ + "Cheng-Yu Hsieh", + "Chih-Kuan Yeh", + "Xuanqing Liu", + "Pradeep Kumar Ravikumar", + "Seungyeon Kim", + "Sanjiv Kumar", + "Cho-Jui Hsieh" + ], + "emails": [ + "~Cheng-Yu_Hsieh1", + "~Cho-Jui_Hsieh1", + "~Pradeep_Kumar_Ravikumar1", + "~Seungyeon_Kim1", + "~Chih-Kuan_Yeh1", + "~Sanjiv_Kumar1", + "~Xuanqing_Liu1" + ], + "rank": 253 + }, + { + "url": "https://openreview.net/forum?id=6FtFPKw8aLj", + "decision": "Reject", + "ratings": [ + 7, + 6, + 7, + 7 + ], + "rating": "6.79", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "There are many cluster similarity indices used to evaluate clustering algorithms, and choosing the best one for a particular task remains an open problem. We demonstrate that this problem is crucial: there are many disagreements among the indices, these disagreements do affect which algorithms are chosen in applications, and this can lead to degraded performance in real-world systems. We propose a theoretical solution to this problem: we develop a list of desirable properties and theoretically verify which indices satisfy them. This allows for making an informed choice: given a particular application, one can first make a selection of properties that are desirable for a given application and then identify indices satisfying these. We observe that many popular indices have significant drawbacks. 
Instead, we advocate using other ones that are not so widely adopted but have beneficial properties.", + "title": "Systematic Analysis of Cluster Similarity Indices: How to Validate Validation Measures", + "authors": [ + "Martijn G\u00f6sgens", + "Liudmila Prokhorenkova", + "Aleksei Tikhonov" + ], + "emails": [ + "~Aleksei_Tikhonov1", + "~Martijn_G\u00f6sgens1", + "~Liudmila_Prokhorenkova1" + ], + "rank": 254 + }, + { + "url": "https://openreview.net/forum?id=-GLNZeVDuik", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 7 + ], + "rating": "6.79", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Despite their popularity, to date, the application of normalizing flows on categorical data stays limited. The current practice of using dequantization to map discrete data to a continuous space is inapplicable as categorical data has no intrinsic order. Instead, categorical data have complex and latent relations that must be inferred, like the synonymy between words. In this paper, we investigate Categorical Normalizing Flows, that is normalizing flows for categorical data. By casting the encoding of categorical data in continuous space as a variational inference problem, we jointly optimize the continuous representation and the model likelihood. Using a factorized decoder, we introduce an inductive bias to model any interactions in the normalizing flow. As a consequence, we do not only simplify the optimization compared to having a joint decoder, but also make it possible to scale up to a large number of categories that is currently impossible with discrete normalizing flows. Based on Categorical Normalizing Flows, we propose GraphCNF a permutation-invariant generative model on graphs. GraphCNF implements a three step approach modeling the nodes, edges, and adjacency matrix stepwise to increase efficiency. 
On molecule generation, GraphCNF outperforms both one-shot and autoregressive flow-based state-of-the-art.\n", + "title": "Categorical Normalizing Flows via Continuous Transformations", + "authors": [ + "Phillip Lippe", + "Efstratios Gavves" + ], + "emails": [ + "~Phillip_Lippe1", + "~Efstratios_Gavves1" + ], + "rank": 255 + }, + { + "url": "https://openreview.net/forum?id=068E_JSq9O", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 8, + 7 + ], + "rating": "6.79", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "This paper introduces Relative Predictive Coding (RPC), a new contrastive representation learning objective that maintains a good balance among training stability, minibatch size sensitivity, and downstream task performance. The key to the success of RPC is two-fold. First, RPC introduces the relative parameters to regularize the objective for boundedness and low variance. Second, RPC contains no logarithm and exponential score functions, which are the main cause of training instability in prior contrastive objectives. We empirically verify the effectiveness of RPC on benchmark vision and speech self-supervised learning tasks. Lastly, we relate RPC with mutual information (MI) estimation, showing RPC can be used to estimate MI with low variance.", + "title": "Self-supervised Representation Learning with Relative Predictive Coding", + "authors": [ + "Yao-Hung Hubert Tsai", + "Martin Q. 
Ma", + "Muqiao Yang", + "Han Zhao", + "Louis-Philippe Morency", + "Ruslan Salakhutdinov" + ], + "emails": [ + "~Yao-Hung_Hubert_Tsai1", + "~Han_Zhao1", + "~Louis-Philippe_Morency1", + "~Muqiao_Yang1", + "~Martin_Q._Ma1", + "~Ruslan_Salakhutdinov1" + ], + "rank": 256 + }, + { + "url": "https://openreview.net/forum?id=jLoC4ez43PZ", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 6 + ], + "rating": "6.79", + "confidences": [ + 3, + 3, + 5, + 3 + ], + "abstract": "Pre-trained models for programming language have achieved dramatic empirical improvements on a variety of code-related tasks such as code search, code completion, code summarization, etc. However, existing pre-trained models regard a code snippet as a sequence of tokens, while ignoring the inherent structure of code, which provides crucial code semantics and would enhance the code understanding process. We present GraphCodeBERT, a pre-trained model for programming language that considers the inherent structure of code. Instead of taking syntactic-level structure of code like abstract syntax tree (AST), we use data flow in the pre-training stage, which is a semantic-level structure of code that encodes the relation of \"where-the-value-comes-from\" between variables. Such a semantic-level structure is neat and does not bring an unnecessarily deep hierarchy of AST, the property of which makes the model more efficient. We develop GraphCodeBERT based on Transformer. In addition to using the task of masked language modeling, we introduce two structure-aware pre-training tasks. One is to predict code structure edges, and the other is to align representations between source code and code structure. We implement the model in an efficient way with a graph-guided masked attention function to incorporate the code structure. We evaluate our model on four tasks, including code search, clone detection, code translation, and code refinement. 
Results show that code structure and newly introduced pre-training tasks can improve GraphCodeBERT and achieves state-of-the-art performance on the four downstream tasks. We further show that the model prefers structure-level attentions over token-level attentions in the task of code search.", + "title": "GraphCodeBERT: Pre-training Code Representations with Data Flow", + "authors": [ + "Daya Guo", + "Shuo Ren", + "Shuai Lu", + "Zhangyin Feng", + "Duyu Tang", + "Shujie LIU", + "Long Zhou", + "Nan Duan", + "Alexey Svyatkovskiy", + "Shengyu Fu", + "Michele Tufano", + "Shao Kun Deng", + "Colin Clement", + "Dawn Drain", + "Neel Sundaresan", + "Jian Yin", + "Daxin Jiang", + "Ming Zhou" + ], + "emails": [ + "ir.hit.edu.cn", + "~Duyu_Tang1", + "pku.edu.cn", + "microsoft.com", + "~Ming_Zhou1", + "~Nan_Duan1", + "~Daya_Guo2", + "~Shujie_LIU1", + "mail.sysu.edu.cn", + "buaa.edu.cn", + "~Long_Zhou2", + "~Alexey_Svyatkovskiy1" + ], + "rank": 257 + }, + { + "url": "https://openreview.net/forum?id=oZIvHV04XgC", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7, + 7 + ], + "rating": "6.79", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "We aim to bridge the gap between typical human and machine-learning environments by extending the standard framework of few-shot learning to an online, continual setting. In this setting, episodes do not have separate training and testing phases, and instead models are evaluated online while learning novel classes. As in the real world, where the presence of spatiotemporal context helps us retrieve learned skills in the past, our online few-shot learning setting also features an underlying context that changes throughout time. Object classes are correlated within a context and inferring the correct context can lead to better performance. Building upon this setting, we propose a new few-shot learning dataset based on large scale indoor imagery that mimics the visual\nexperience of an agent wandering within a world. 
Furthermore, we convert popular few-shot learning approaches into online versions and we also propose a new model that can make use of spatiotemporal contextual information from the recent past.", + "title": "Wandering within a world: Online contextualized few-shot learning", + "authors": [ + "Mengye Ren", + "Michael Louis Iuzzolino", + "Michael Curtis Mozer", + "Richard Zemel" + ], + "emails": [ + "~Mengye_Ren1", + "~Michael_Louis_Iuzzolino1", + "~Michael_Curtis_Mozer1", + "~Richard_Zemel1" + ], + "rank": 258 + }, + { + "url": "https://openreview.net/forum?id=sy4Kg_ZQmS7", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 8, + 7 + ], + "rating": "6.79", + "confidences": [ + 3, + 2, + 5, + 4 + ], + "abstract": "Instrumental variable (IV) regression is a standard strategy for learning causal relationships between confounded treatment and outcome variables from observational data by using an instrumental variable, which affects the outcome only through the treatment. In classical IV regression, learning proceeds in two stages: stage 1 performs linear regression from the instrument to the treatment; and stage 2 performs linear regression from the treatment to the outcome, conditioned on the instrument. We propose a novel method, deep feature instrumental variable regression (DFIV), to address the case where relations between instruments, treatments, and outcomes may be nonlinear. In this case, deep neural nets are trained to define informative nonlinear features on the instruments and treatments. We propose an alternating training regime for these features to ensure good end-to-end performance when composing stages 1 and 2, thus obtaining highly flexible feature maps in a computationally efficient manner.\nDFIV outperforms recent state-of-the-art methods on challenging IV benchmarks, including settings involving high dimensional image data. 
DFIV also exhibits competitive performance in off-policy policy evaluation for reinforcement learning, which can be understood as an IV regression task.", + "title": "Learning Deep Features in Instrumental Variable Regression", + "authors": [ + "Liyuan Xu", + "Yutian Chen", + "Siddarth Srinivasan", + "Nando de Freitas", + "Arnaud Doucet", + "Arthur Gretton" + ], + "emails": [ + "~Yutian_Chen1", + "~Arthur_Gretton1", + "~Arnaud_Doucet2", + "cs.washington.edu", + "~Nando_de_Freitas1", + "~Liyuan_Xu1" + ], + "rank": 259 + }, + { + "url": "https://openreview.net/forum?id=8HhkbjrWLdE", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 8 + ], + "rating": "6.79", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Numerical experiments demonstrate that deep neural network classifiers progressively separate class distributions around their mean, achieving linear separability on the training set, and increasing the Fisher discriminant ratio. We explain this mechanism with two types of operators. We prove that a rectifier without biases applied to sign-invariant tight frames can separate class means and increase Fisher ratios. On the opposite, a soft-thresholding on tight frames can reduce within-class variabilities while preserving class means. Variance reduction bounds are proved for Gaussian mixture models. For image classification, we show that separation of class means can be achieved with rectified wavelet tight frames that are not learned. It defines a scattering transform. Learning 1\u00d71 convolutional tight frames along scattering channels and applying a soft-thresholding reduces within-class variabilities. 
The resulting scattering network reaches the classification accuracy of ResNet-18 on CIFAR-10 and ImageNet, with fewer layers and no learned biases.", + "title": "Separation and Concentration in Deep Networks", + "authors": [ + "John Zarka", + "Florentin Guth", + "St\u00e9phane Mallat" + ], + "emails": [ + "~St\u00e9phane_Mallat1", + "~Florentin_Guth1", + "~John_Zarka1" + ], + "rank": 260 + }, + { + "url": "https://openreview.net/forum?id=aLIbnLY9NtH", + "ratings": [ + 7, + 6, + 7, + 7 + ], + "rating": "6.79", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Despite their success in perception over the last decade, deep neural networks are also known ravenous to labeled data for training, which limits their applicability to real-world problems. Hence self-supervised learning has recently attracted intensive attention. Contrastive learning has been one of the dominant approaches for effective feature extraction and has also achieved state-of-the-art performance. In this paper, we first theoretically show that these methods cannot fully take advantage of training samples in the sense of nearest positive samples mining. Then we propose a new contrastive method called AdaCLRpre (adaptive self-supervised contrastive learning representations), which can more effectively (supported by our proof) explore the samples in a way of being closer to supervised contrastive learning. We thoroughly evaluate the quality of the learned representation on ImageNet for pretraining based version (AdaCLRpre). 
The results of accuracy show AdaCLRpre outperforms state-of-the-art contrastive-based models by 3.0\\% with extra 100 epochs.", + "title": "Self-supervised representation learning via adaptive hard-positive mining", + "authors": [ + "Shaofeng Zhang", + "Junchi Yan", + "Xiaokang Yang" + ], + "emails": [ + "~Xiaokang_Yang1", + "~Shaofeng_Zhang1", + "~Junchi_Yan2" + ], + "rank": 261 + }, + { + "url": "https://openreview.net/forum?id=O-6Pm_d_Q-", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 5, + 7, + 6 + ], + "rating": "6.78", + "confidences": [ + 2, + 1, + 4, + 2 + ], + "abstract": "We study the multiple manifold problem, a binary classification task modeled on applications in machine vision, in which a deep fully-connected neural network is trained to separate two low-dimensional submanifolds of the unit sphere. We provide an analysis of the one-dimensional case, proving for a simple manifold configuration that when the network depth L is large relative to certain geometric and statistical properties of the data, the network width n grows as a sufficiently large polynomial in L, and the number of i.i.d. samples from the manifolds is polynomial in L, randomly-initialized gradient descent rapidly learns to classify the two manifolds perfectly with high probability. Our analysis demonstrates concrete benefits of depth and width in the context of a practically-motivated model problem: the depth acts as a fitting resource, with larger depths corresponding to smoother networks that can more readily separate the class manifolds, and the width acts as a statistical resource, enabling concentration of the randomly-initialized network and its gradients. The argument centers around the \"neural tangent kernel\" of Jacot et al. 
and its role in the nonasymptotic analysis of training overparameterized neural networks; to this literature, we contribute essentially optimal rates of concentration for the neural tangent kernel of deep fully-connected ReLU networks, requiring width n\u2265Lpoly(d0) to achieve uniform concentration of the initial kernel over a d0-dimensional submanifold of the unit sphere Sn0\u22121, and a nonasymptotic framework for establishing generalization of networks trained in the \"NTK regime\" with structured data. The proof makes heavy use of martingale concentration to optimally treat statistical dependencies across layers of the initial random network. This approach should be of use in establishing similar results for other network architectures.", + "title": "Deep Networks and the Multiple Manifold Problem", + "authors": [ + "Sam Buchanan", + "Dar Gilboa", + "John Wright" + ], + "emails": [ + "~John_Wright1", + "~Sam_Buchanan1", + "~Dar_Gilboa1" + ], + "rank": 262 + }, + { + "url": "https://openreview.net/forum?id=RSU17UoKfJF", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7 + ], + "rating": "6.78", + "confidences": [ + 3, + 2, + 4 + ], + "abstract": "Federated learning frameworks have been regarded as a promising approach to break the dilemma between demands on privacy and the promise of learning from large collections of distributed data. Many such frameworks only ask collaborators to share their local update of a common model, i.e. gradients with respect to locally stored data, instead of exposing their raw data to other collaborators. However, recent optimization-based gradient attacks show that raw data can often be accurately recovered from gradients. It has been shown that minimizing the Euclidean distance between true gradients and those calculated from estimated data is often effective in fully recovering private data. 
However, there is a fundamental lack of theoretical understanding of how and when gradients can lead to unique recovery of original data. Our research fills this gap by providing a closed-form recursive procedure to recover data from gradients in deep neural networks. We name it Recursive Gradient Attack on Privacy (R-GAP). Experimental results demonstrate that R-GAP works as well as or even better than optimization-based approaches at a fraction of the computation under certain conditions. Additionally, we propose a Rank Analysis method, which can be used to estimate the risk of gradient attacks inherent in certain network architectures, regardless of whether an optimization-based or closed-form-recursive attack is used. Experimental results demonstrate the utility of the rank analysis towards improving the network's security. Source code is available for download from https://github.com/JunyiZhu-AI/R-GAP.", + "title": "R-GAP: Recursive Gradient Attack on Privacy", + "authors": [ + "Junyi Zhu", + "Matthew B. Blaschko" + ], + "emails": [ + "~Matthew_B._Blaschko1", + "~Junyi_Zhu1" + ], + "rank": 263 + }, + { + "url": "https://openreview.net/forum?id=gLWj29369lW", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 7 + ], + "rating": "6.77", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Many models learn representations of knowledge graph data by exploiting its low-rank latent structure, encoding known relations between entities and enabling unknown facts to be inferred. To predict whether a relation holds between entities, embeddings are typically compared in the latent space following a relation-specific mapping. Whilst their predictive performance has steadily improved, how such models capture the underlying latent structure of semantic information remains unexplained. 
Building on recent theoretical understanding of word embeddings, we categorise knowledge graph relations into three types and for each derive explicit requirements of their representations. We show that empirical properties of relation representations and the relative performance of leading knowledge graph representation methods are justified by our analysis.", + "title": "Interpreting Knowledge Graph Relation Representation from Word Embeddings", + "authors": [ + "Carl Allen", + "Ivana Balazevic", + "Timothy Hospedales" + ], + "emails": [ + "~Ivana_Balazevic1", + "~Carl_Allen1", + "~Timothy_Hospedales1" + ], + "rank": 264 + }, + { + "url": "https://openreview.net/forum?id=y06VOYLcQXa", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 7, + 8 + ], + "rating": "6.77", + "confidences": [ + 3, + 3, + 4, + 3 + ], + "abstract": "System side channels denote effects imposed on the underlying system and hardware when running a program, such as its accessed CPU cache lines. Side channel analysis (SCA) allows attackers to infer program secrets based on observed side channel signals. Given the ever-growing adoption of machine learning as a service (MLaaS), image analysis software on cloud platforms has been exploited by reconstructing private user images from system side channels. Nevertheless, to date, SCA is still highly challenging, requiring technical knowledge of victim software's internal operations. For existing SCA attacks, comprehending such internal operations requires heavyweight program analysis or manual efforts.\n\nThis research proposes an attack framework to reconstruct private user images processed by media software via system side channels. The framework forms an effective workflow by incorporating convolutional networks, variational autoencoders, and generative adversarial networks. Our evaluation of two popular side channels shows that the reconstructed images consistently match user inputs, making privacy leakage attacks more practical. 
We also show surprising results that even one-bit data read/write pattern side channels, which are deemed minimally informative, can be used to reconstruct quality images using our framework.", + "title": "Private Image Reconstruction from System Side Channels Using Generative Models", + "authors": [ + "Yuanyuan Yuan", + "Shuai Wang", + "Junping Zhang" + ], + "emails": [ + "~Yuanyuan_Yuan1", + "~Junping_Zhang2", + "~Shuai_Wang7" + ], + "rank": 265 + }, + { + "url": "https://openreview.net/forum?id=2VXyy9mIyU3", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 8 + ], + "rating": "6.77", + "confidences": [ + 4, + 4, + 5 + ], + "abstract": "Human-annotated labels are often prone to noise, and the presence of such noise will degrade the performance of the resulting deep neural network (DNN) models. Much of the literature (with several recent exceptions) of learning with noisy labels focuses on the case when the label noise is independent of features. Practically, annotations errors tend to be instance-dependent and often depend on the difficulty levels of recognizing a certain task. Applying existing results from instance-independent settings would require a significant amount of estimation of noise rates. Therefore, providing theoretically rigorous solutions for learning with instance-dependent label noise remains a challenge. In this paper, we propose CORES2 (COnfidence REgularized Sample Sieve), which progressively sieves out corrupted examples. The implementation of CORES2 does not require specifying noise rates and yet we are able to provide theoretical guarantees of CORES2 in filtering out the corrupted examples. This high-quality sample sieve allows us to treat clean examples and the corrupted ones separately in training a DNN solution, and such a separation is shown to be advantageous in the instance-dependent noise setting. 
We demonstrate the performance of CORES2 on CIFAR10 and CIFAR100 datasets with synthetic instance-dependent label noise and Clothing1M with real-world human noise. As of independent interests, our sample sieve provides a generic machinery for anatomizing noisy datasets and provides a flexible interface for various robust training techniques to further improve the performance. Code is available at https://github.com/UCSC-REAL/cores.", + "title": "Learning with Instance-Dependent Label Noise: A Sample Sieve Approach", + "authors": [ + "Hao Cheng", + "Zhaowei Zhu", + "Xingyu Li", + "Yifei Gong", + "Xing Sun", + "Yang Liu" + ], + "emails": [ + "~Hao_Cheng5", + "~Yifei_Gong1", + "~Xing_Sun1", + "~Zhaowei_Zhu1", + "~Yang_Liu3", + "~Xingyu_Li2" + ], + "rank": 266 + }, + { + "url": "https://openreview.net/forum?id=193sEnKY1ij", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 7, + 6, + 6 + ], + "rating": "6.77", + "confidences": [ + 4, + 2, + 4, + 3 + ], + "abstract": "There has been increasing interest in building deep hierarchy-aware classifiers that aim to quantify and reduce the severity of mistakes, and not just reduce the number of errors. The idea is to exploit the label hierarchy (e.g., the WordNet ontology) and consider graph distances as a proxy for mistake severity. Surprisingly, on examining mistake-severity distributions of the top-1 prediction, we find that current state-of-the-art hierarchy-aware deep classifiers do not always show practical improvement over the standard cross-entropy baseline in making better mistakes. The reason for the reduction in average mistake-severity can be attributed to the increase in low-severity mistakes, which may also explain the noticeable drop in their accuracy. To this end, we use the classical Conditional Risk Minimization (CRM) framework for hierarchy-aware classification. 
Given a cost matrix and a reliable estimate of likelihoods (obtained from a trained network), CRM simply amends mistakes at inference time; it needs no extra hyperparameters and requires adding just a few lines of code to the standard cross-entropy baseline. It significantly outperforms the state-of-the-art and consistently obtains large reductions in the average hierarchical distance of top-k predictions across datasets, with very little loss in accuracy. CRM, because of its simplicity, can be used with any off-the-shelf trained model that provides reliable likelihood estimates.", + "title": "No Cost Likelihood Manipulation at Test Time for Making Better Mistakes in Deep Networks", + "authors": [ + "Shyamgopal Karthik", + "Ameya Prabhu", + "Puneet K. Dokania", + "Vineet Gandhi" + ], + "emails": [ + "~Puneet_K._Dokania1", + "~Ameya_Prabhu1", + "~Shyamgopal_Karthik1", + "~Vineet_Gandhi2" + ], + "rank": 267 + }, + { + "url": "https://openreview.net/forum?id=MtEE0CktZht", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 6 + ], + "rating": "6.77", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "Exploration under sparse reward is a long-standing challenge of model-free reinforcement learning. The state-of-the-art methods address this challenge by introducing intrinsic rewards to encourage exploration in novel states or uncertain environment dynamics. Unfortunately, methods based on intrinsic rewards often fall short in procedurally-generated environments, where a different environment is generated in each episode so that the agent is not likely to visit the same state more than once. Motivated by how humans distinguish good exploration behaviors by looking into the entire episode, we introduce RAPID, a simple yet effective episode-level exploration method for procedurally-generated environments. RAPID regards each episode as a whole and gives an episodic exploration score from both per-episode and long-term views. 
Those highly scored episodes are treated as good exploration behaviors and are stored in a small ranking buffer. The agent then imitates the episodes in the buffer to reproduce the past good exploration behaviors. We demonstrate our method on several procedurally-generated MiniGrid environments, a first-person-view 3D Maze navigation task from MiniWorld, and several sparse MuJoCo tasks. The results show that RAPID significantly outperforms the state-of-the-art intrinsic reward strategies in terms of sample efficiency and final performance. The code is available at https://github.com/daochenzha/rapid", + "title": "Rank the Episodes: A Simple Approach for Exploration in Procedurally-Generated Environments", + "authors": [ + "Daochen Zha", + "Wenye Ma", + "Lei Yuan", + "Xia Hu", + "Ji Liu" + ], + "emails": [ + "~Lei_Yuan1", + "~Daochen_Zha1", + "gmail.com", + "~Ji_Liu1", + "~Xia_Hu4" + ], + "rank": 268 + }, + { + "url": "https://openreview.net/forum?id=aLtty4sUo0o", + "decision": "Reject", + "ratings": [ + 6, + 7, + 7, + 7 + ], + "rating": "6.77", + "confidences": [ + 3, + 2, + 4, + 4 + ], + "abstract": "We consider the quickest change detection problem where both the parameters of pre- and post- change distributions are unknown, which prevent the use of classical simple hypothesis testing. Without additional assumptions, optimal solutions are not tractable as they rely on some minimax and robust variant of the objective. As a consequence, change points might be detected too late for practical applications (in economics, health care or maintenance for instance). \nOther approaches solve a relaxed version of the problem through the use of particular probability distributions or the use of domain knowledge. \n\nWe tackle this problem in the more complex Markovian case and we provide a new scalable approximate algorithm with near optimal performance that runs in O(1). 
", + "title": "Quickest change detection for multi-task problems under unknown parameters", + "authors": [ + "Firas Jarboui", + "Vianney Perchet" + ], + "emails": [ + "~Firas_Jarboui1", + "~Vianney_Perchet3" + ], + "rank": 269 + }, + { + "url": "https://openreview.net/forum?id=8Sqhl-nF50", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 3, + 8, + 8 + ], + "rating": "6.77", + "confidences": [ + 3, + 2, + 4, + 4 + ], + "abstract": "We study the approximation properties and optimization dynamics of recurrent neural networks (RNNs) when applied to learn input-output relationships in temporal data. We consider the simple but representative setting of using continuous-time linear RNNs to learn from data generated by linear relationships. Mathematically, the latter can be understood as a sequence of linear functionals. We prove a universal approximation theorem of such linear functionals and characterize the approximation rate. Moreover, we perform a fine-grained dynamical analysis of training linear RNNs by gradient methods. A unifying theme uncovered is the non-trivial effect of memory, a notion that can be made precise in our framework, on both approximation and optimization: when there is long-term memory in the target, it takes a large number of neurons to approximate it. Moreover, the training process will suffer from slow downs. In particular, both of these effects become exponentially more pronounced with increasing memory - a phenomenon we call the \u201ccurse of memory\u201d. 
These analyses represent a basic step towards a concrete mathematical understanding of new phenomenons that may arise in learning temporal relationships using recurrent architectures.", + "title": "On the Curse of Memory in Recurrent Neural Networks: Approximation and Optimization Analysis", + "authors": [ + "Zhong Li", + "Jiequn Han", + "Weinan E", + "Qianxiao Li" + ], + "emails": [ + "~Qianxiao_Li1", + "~Weinan_E1", + "~Jiequn_Han1", + "~Zhong_Li2" + ], + "rank": 270 + }, + { + "url": "https://openreview.net/forum?id=7R7fAoUygoa", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 7 + ], + "rating": "6.77", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "Recent empirical and theoretical studies have shown that many learning algorithms -- from linear regression to neural networks -- can have test performance that is non-monotonic in quantities such the sample size and model size. This striking phenomenon, often referred to as \"double descent\", has raised questions of if we need to re-think our current understanding of generalization. In this work, we study whether the double-descent phenomenon can be avoided by using optimal regularization. Theoretically, we prove that for certain linear regression models with isotropic data distribution, optimally-tuned \u21132 regularization achieves monotonic test performance as we grow either the sample size or the model size.\nWe also demonstrate empirically that optimally-tuned \u21132 regularization can mitigate double descent for more general models, including neural networks.\nOur results suggest that it may also be informative to study the test risk scalings of various algorithms in the context of appropriately tuned regularization.", + "title": "Optimal Regularization can Mitigate Double Descent", + "authors": [ + "Preetum Nakkiran", + "Prayaag Venkat", + "Sham M. 
Kakade", + "Tengyu Ma" + ], + "emails": [ + "g.harvard.edu", + "~Sham_M._Kakade1", + "~Tengyu_Ma1", + "~Preetum_Nakkiran1" + ], + "rank": 271 + }, + { + "url": "https://openreview.net/forum?id=9l0K4OM-oXE", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 7 + ], + "rating": "6.76", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Deep neural networks (DNNs) are known vulnerable to backdoor attacks, a training time attack that injects a trigger pattern into a small proportion of training data so as to control the model's prediction at the test time. Backdoor attacks are notably dangerous since they do not affect the model's performance on clean examples, yet can fool the model to make the incorrect prediction whenever the trigger pattern appears during testing. In this paper, we propose a novel defense framework Neural Attention Distillation (NAD) to erase backdoor triggers from backdoored DNNs. NAD utilizes a teacher network to guide the finetuning of the backdoored student network on a small clean subset of data such that the intermediate-layer attention of the student network aligns with that of the teacher network. The teacher network can be obtained by an independent finetuning process on the same clean subset. We empirically show, against 6 state-of-the-art backdoor attacks, NAD can effectively erase the backdoor triggers using only 5\\% clean training data without causing obvious performance degradation on clean examples. 
Our code is available at https://github.com/bboylyg/NAD.", + "title": "Neural Attention Distillation: Erasing Backdoor Triggers from Deep Neural Networks", + "authors": [ + "Yige Li", + "Xixiang Lyu", + "Nodens Koren", + "Lingjuan Lyu", + "Bo Li", + "Xingjun Ma" + ], + "emails": [ + "~Yige_Li1", + "~Lingjuan_Lyu1", + "mail.xidian.edu.cn", + "~Bo_Li19", + "~Xingjun_Ma1", + "~Nodens_Koren1" + ], + "rank": 272 + }, + { + "url": "https://openreview.net/forum?id=kE3vd639uRW", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 8, + 7 + ], + "rating": "6.76", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Pooling is a critical operation in convolutional neural networks for increasing receptive fields and improving robustness to input variations. Most existing pooling operations downsample the feature maps, which is a lossy process. Moreover, they are not invertible: upsampling a downscaled feature map can not recover the lost information in the downsampling. By adopting the philosophy of the classical Lifting Scheme from signal processing, we propose LiftPool for bidirectional pooling layers, including LiftDownPool and LiftUpPool. LiftDownPool decomposes a feature map into various downsized sub-bands, each of which contains information with different frequencies. As the pooling function in LiftDownPool is perfectly invertible, by performing LiftDownPool backward, a corresponding up-pooling layer LiftUpPool is able to generate a refined upsampled feature map using the detail subbands, which is useful for image-to-image translation challenges. Experiments show the proposed methods achieve better results on image classification and semantic segmentation, using various backbones. Moreover, LiftDownPool offers better robustness to input corruptions and perturbations.", + "title": "LiftPool: Bidirectional ConvNet Pooling", + "authors": [ + "Jiaojiao Zhao", + "Cees G. M. 
Snoek" + ], + "emails": [ + "~Cees_G._M._Snoek1", + "~Jiaojiao_Zhao2" + ], + "rank": 273 + }, + { + "url": "https://openreview.net/forum?id=eMP1j9efXtX", + "decision": "Accept (Spotlight)", + "ratings": [ + 6, + 7, + 7, + 7, + 7 + ], + "rating": "6.76", + "confidences": [ + 4, + 3, + 3, + 3, + 4 + ], + "abstract": "We study an approach to offline reinforcement learning (RL) based on optimally solving finitely-represented MDPs derived from a static dataset of experience. This approach can be applied on top of any learned representation and has the potential to easily support multiple solution objectives as well as zero-shot adjustment to changing environments and goals. Our main contribution is to introduce the Deep Averagers with Costs MDP (DAC-MDP) and to investigate its solutions for offline RL. DAC-MDPs are a non-parametric model that can leverage deep representations and account for limited data by introducing costs for exploiting under-represented parts of the model. In theory, we show conditions that allow for lower-bounding the performance of DAC-MDP solutions. We also investigate the empirical behavior in a number of environments, including those with image-based observations. 
Overall, the experiments demonstrate that the framework can work in practice and scale to large complex offline RL problems.", + "title": "DeepAveragers: Offline Reinforcement Learning By Solving Derived Non-Parametric MDPs", + "authors": [ + "Aayam Kumar Shrestha", + "Stefan Lee", + "Prasad Tadepalli", + "Alan Fern" + ], + "emails": [ + "~Stefan_Lee1", + "~Prasad_Tadepalli1", + "~Aayam_Kumar_Shrestha1", + "~Alan_Fern1" + ], + "rank": 274 + }, + { + "url": "https://openreview.net/forum?id=Eql5b1_hTE4", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 6 + ], + "rating": "6.76", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "The \\textit{memorization effects} of deep networks show that they will first memorize training data with clean labels and then those with noisy labels. The \\textit{early stopping} method therefore can be exploited for learning with noisy labels. However, the side effect brought by noisy labels will influence the memorization of clean labels before early stopping. In this paper, motivated by the \\textit{lottery ticket hypothesis} which shows that only partial parameters are important for generalization, we find that only partial parameters are important for fitting clean labels and generalize well, which we term as \\textit{critical parameters}; while the other parameters tend to fit noisy labels and cannot generalize well, which we term as \\textit{non-critical parameters}. Based on this, we propose \\textit{robust early-learning} to reduce the side effect of noisy labels before early stopping and thus enhance the memorization of clean labels. Specifically, in each iteration, we divide all parameters into the critical and non-critical ones, and then perform different update rules for different types of parameters. 
Extensive experiments on benchmark-simulated and real-world label-noise datasets demonstrate the superiority of the proposed method over the state-of-the-art label-noise learning methods.", + "title": "Robust early-learning: Hindering the memorization of noisy labels", + "authors": [ + "Xiaobo Xia", + "Tongliang Liu", + "Bo Han", + "Chen Gong", + "Nannan Wang", + "Zongyuan Ge", + "Yi Chang" + ], + "emails": [ + "~Nannan_Wang1", + "~Chen_Gong5", + "~Xiaobo_Xia1", + "jlu.edu.cn", + "~Bo_Han1", + "~Zongyuan_Ge1", + "~Tongliang_Liu1" + ], + "rank": 275 + }, + { + "url": "https://openreview.net/forum?id=a2gqxKDvYys", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 7 + ], + "rating": "6.76", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "Amortised inference enables scalable learning of sequential latent-variable models (LVMs) with the evidence lower bound (ELBO). In this setting, variational posteriors are often only partially conditioned. While the true posteriors depend, e.g., on the entire sequence of observations, approximate posteriors are only informed by past observations. This mimics the Bayesian filter---a mixture of smoothing posteriors. Yet, we show that the ELBO objective forces partially-conditioned amortised posteriors to approximate products of smoothing posteriors instead. Consequently, the learned generative model is compromised. We demonstrate these theoretical findings in three scenarios: traffic flow, handwritten digits, and aerial vehicle dynamics. 
Using fully-conditioned approximate posteriors, performance improves in terms of generative modelling and multi-step prediction.", + "title": "Mind the Gap when Conditioning Amortised Inference in Sequential Latent-Variable Models", + "authors": [ + "Justin Bayer", + "Maximilian Soelch", + "Atanas Mirchev", + "Baris Kayalibay", + "Patrick van der Smagt" + ], + "emails": [ + "~Baris_Kayalibay1", + "~Justin_Bayer1", + "~Patrick_van_der_Smagt1", + "~Maximilian_Soelch1", + "~Atanas_Mirchev1" + ], + "rank": 276 + }, + { + "url": "https://openreview.net/forum?id=uQfOy7LrlTR", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 8, + 7, + 7, + 6, + 6 + ], + "rating": "6.76", + "confidences": [ + 3, + 5, + 2, + 4, + 2, + 1 + ], + "abstract": "Tight and efficient neural network bounding is of critical importance for the scaling of neural network verification systems. A number of efficient specialised dual solvers for neural network bounds have been presented recently, but they are often too loose to verify more challenging properties. This lack of tightness is linked to the weakness of the employed relaxation, which is usually a linear program of size linear in the number of neurons. While a tighter linear relaxation for piecewise linear activations exists, it comes at the cost of exponentially many constraints and thus currently lacks an efficient customised solver. We alleviate this deficiency via a novel dual algorithm that realises the full potential of the new relaxation by operating on a small active set of dual variables. Our method recovers the strengths of the new relaxation in the dual space: tightness and a linear separation oracle. At the same time, it shares the benefits of previous dual approaches for weaker relaxations: massive parallelism, GPU implementation, low cost per iteration and valid bounds at any time. 
As a consequence, we obtain better bounds than off-the-shelf solvers in only a fraction of their running time and recover the speed-accuracy trade-offs of looser dual solvers if the computational budget is small. We demonstrate that this results in significant formal verification speed-ups.", + "title": "Scaling the Convex Barrier with Active Sets", + "authors": [ + "Alessandro De Palma", + "Harkirat Behl", + "Rudy R Bunel", + "Philip Torr", + "M. Pawan Kumar" + ], + "emails": [ + "~Alessandro_De_Palma1", + "~M._Pawan_Kumar1", + "~Rudy_R_Bunel1", + "~Harkirat_Behl1", + "~Philip_Torr1" + ], + "rank": 277 + }, + { + "url": "https://openreview.net/forum?id=tV6oBfuyLTQ", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 7 + ], + "rating": "6.76", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "Traditional off-policy actor-critic Reinforcement Learning (RL) algorithms learn value functions of a single target policy. However, when value functions are updated to track the learned policy, they forget potentially useful information about old policies. We introduce a class of value functions called Parameter-Based Value Functions (PBVFs) whose inputs include the policy parameters. They can generalize across different policies. PBVFs can evaluate the performance of any policy given a state, a state-action pair, or a distribution over the RL agent's initial states. First we show how PBVFs yield novel off-policy policy gradient theorems. Then we derive off-policy actor-critic algorithms based on PBVFs trained by Monte Carlo or Temporal Difference methods. We show how learned PBVFs can zero-shot learn new policies that outperform any policy seen during training. Finally our algorithms are evaluated on a selection of discrete and continuous control tasks using shallow policies and deep neural networks. 
Their performance is comparable to state-of-the-art methods.", + "title": "Parameter-Based Value Functions", + "authors": [ + "Francesco Faccio", + "Louis Kirsch", + "J\u00fcrgen Schmidhuber" + ], + "emails": [ + "~Louis_Kirsch1", + "~J\u00fcrgen_Schmidhuber1", + "~Francesco_Faccio1" + ], + "rank": 278 + }, + { + "url": "https://openreview.net/forum?id=uBHs6zpY4in", + "decision": "Reject", + "ratings": [ + 7, + 9, + 5, + 6 + ], + "rating": "6.76", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Measuring the discrepancy between two probability distributions is a fundamental problem in machine learning and statistics. Based on ideas from decision theory, we investigate a new class of discrepancies that are based on the optimal decision loss. Two probability distributions are different if the optimal decision loss is higher on the mixture distribution than on each individual distribution. We show that this generalizes popular notions of discrepancy measurements such as the Jensen Shannon divergence and the maximum mean discrepancy. We apply our approach to two-sample tests, which evaluates whether two sets of samples come from the same distribution. On various benchmark and real datasets, we demonstrate that tests based on our generalized notion of discrepancy is able to achieve superior test power. 
We also apply our approach to sample quality evaluation as an alternative to the FID score, and to understanding the effects of climate change on different social and economic activities.", + "title": "H-divergence: A Decision-Theoretic Probability Discrepancy Measure ", + "authors": [ + "Shengjia Zhao", + "Abhishek Sinha", + "Yutong He", + "Aidan Perreault", + "Jiaming Song", + "Stefano Ermon" + ], + "emails": [ + "~Yutong_He1", + "~Jiaming_Song1", + "~Aidan_Perreault1", + "~Abhishek_Sinha1", + "~Stefano_Ermon1", + "~Shengjia_Zhao1" + ], + "rank": 279 + }, + { + "url": "https://openreview.net/forum?id=pBqLS-7KYAF", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 6, + 7, + 7 + ], + "rating": "6.75", + "confidences": [ + 4, + 3, + 3, + 2 + ], + "abstract": "Given a large data matrix, sparsifying, quantizing, and/or performing other entry-wise nonlinear operations can have numerous benefits, ranging from speeding up iterative algorithms for core numerical linear algebra problems to providing nonlinear filters to design state-of-the-art neural network models. Here, we exploit tools from random matrix theory to make precise statements about how the eigenspectrum of a matrix changes under such nonlinear transformations. In particular, we show that very little change occurs in the informative eigenstructure, even under drastic sparsification/quantization, and consequently that very little downstream performance loss occurs when working with very aggressively sparsified or quantized spectral clustering problems.\nWe illustrate how these results depend on the nonlinearity, we characterize a phase transition beyond which spectral clustering becomes possible, and we show when such nonlinear transformations can introduce spurious non-informative eigenvectors.", + "title": "Sparse Quantized Spectral Clustering", + "authors": [ + "Zhenyu Liao", + "Romain Couillet", + "Michael W. 
Mahoney" + ], + "emails": [ + "~Romain_Couillet1", + "~Michael_W._Mahoney1", + "~Zhenyu_Liao1" + ], + "rank": 280 + }, + { + "url": "https://openreview.net/forum?id=tC6iW2UUbJf", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 5, + 8 + ], + "rating": "6.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Contrastive visual pretraining based on the instance discrimination pretext task has made significant progress. Notably, recent work on unsupervised pretraining has shown to surpass the supervised counterpart for finetuning downstream applications such as object detection and segmentation. It comes as a surprise that image annotations would be better left unused for transfer learning. In this work, we investigate the following problems: What makes instance discrimination pretraining good for transfer learning? What knowledge is actually learned and transferred from these models? From this understanding of instance discrimination, how can we better exploit human annotation labels for pretraining? Our findings are threefold. First, what truly matters for the transfer is low-level and mid-level representations, not high-level representations. Second, the intra-category invariance enforced by the traditional supervised model weakens transferability by increasing task misalignment. Finally, supervised pretraining can be strengthened by following an exemplar-based approach without explicit constraints among the instances within the same category.", + "title": "What Makes Instance Discrimination Good for Transfer Learning?", + "authors": [ + "Nanxuan Zhao", + "Zhirong Wu", + "Rynson W. H. 
Lau", + "Stephen Lin" + ], + "emails": [ + "~Zhirong_Wu1", + "~Rynson_W._H._Lau1", + "~Nanxuan_Zhao1", + "~Stephen_Lin1" + ], + "rank": 281 + }, + { + "url": "https://openreview.net/forum?id=qVyeW-grC2k", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 7 + ], + "rating": "6.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Transformers do not scale very well to long sequence lengths largely because of quadratic self-attention complexity. In the recent months, a wide spectrum of efficient, fast Transformers have been proposed to tackle this problem, more often than not claiming superior or comparable model quality to vanilla Transformer models. To this date, there is no well-established consensus on how to evaluate this class of models. Moreover, inconsistent benchmarking on a wide spectrum of tasks and datasets makes it difficult to assess relative model quality amongst many models. This paper proposes a systematic and unified benchmark, Long Range Arena, specifically focused on evaluating model quality under long-context scenarios. Our benchmark is a suite of tasks consisting of sequences ranging from 1K to 16K tokens, encompassing a wide range of data types and modalities such as text, natural, synthetic images, and mathematical expressions requiring similarity, structural, and visual-spatial reasoning. We systematically evaluate ten well-established long-range Transformer models (Reformers, Linformers, Linear Transformers, Sinkhorn Transformers, Performers, Synthesizers, Sparse Transformers, and Longformers) on our newly proposed benchmark suite. 
Long Range Arena paves the way towards better understanding this class of efficient Transformer models, facilitates more research in this direction, and presents new challenging tasks to tackle.", + "title": "Long Range Arena : A Benchmark for Efficient Transformers ", + "authors": [ + "Yi Tay", + "Mostafa Dehghani", + "Samira Abnar", + "Yikang Shen", + "Dara Bahri", + "Philip Pham", + "Jinfeng Rao", + "Liu Yang", + "Sebastian Ruder", + "Donald Metzler" + ], + "emails": [ + "~Dara_Bahri1", + "~Mostafa_Dehghani1", + "~Sebastian_Ruder2", + "~Samira_Abnar1", + "~Jinfeng_Rao2", + "~Yi_Tay1", + "google.com", + "~Yikang_Shen1", + "~Philip_Pham1" + ], + "rank": 282 + }, + { + "url": "https://openreview.net/forum?id=dFwBosAcJkN", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 7 + ], + "rating": "6.75", + "confidences": [ + 3, + 4, + 4, + 5 + ], + "abstract": "A key challenge in adversarial robustness is the lack of a precise mathematical characterization of human perception, used in the definition of adversarial attacks that are imperceptible to human eyes. Most current attacks and defenses try to get around this issue by considering restrictive adversarial threat models such as those bounded by L2 or L\u221e distance, spatial perturbations, etc. However, models that are robust against any of these restrictive threat models are still fragile against other threat models, i.e. they have poor generalization to unforeseen attacks. Moreover, even if a model is robust against the union of several restrictive threat models, it is still susceptible to other imperceptible adversarial examples that are not contained in any of the constituent threat models. To resolve these issues, we propose adversarial training against the set of all imperceptible adversarial examples. Since this set is intractable to compute without a human in the loop, we approximate it using deep neural networks. 
We call this threat model the neural perceptual threat model (NPTM); it includes adversarial examples with a bounded neural perceptual distance (a neural network-based approximation of the true perceptual distance) to natural images. Through an extensive perceptual study, we show that the neural perceptual distance correlates well with human judgements of perceptibility of adversarial examples, validating our threat model.\n\nUnder the NPTM, we develop novel perceptual adversarial attacks and defenses. Because the NPTM is very broad, we find that Perceptual Adversarial Training (PAT) against a perceptual attack gives robustness against many other types of adversarial attacks. We test PAT on CIFAR-10 and ImageNet-100 against five diverse adversarial attacks: L2, L\u221e, spatial, recoloring, and JPEG. We find that PAT achieves state-of-the-art robustness against the union of these five attacks\u2014more than doubling the accuracy over the next best model\u2014without training against any of them. That is, PAT generalizes well to unforeseen perturbation types. This is vital in sensitive applications where a particular threat model cannot be assumed, and to the best of our knowledge, PAT is the first adversarial defense with this property.\n\nCode and data are available at https://github.com/cassidylaidlaw/perceptual-advex", + "title": "Perceptual Adversarial Robustness: Defense Against Unseen Threat Models", + "authors": [ + "Cassidy Laidlaw", + "Sahil Singla", + "Soheil Feizi" + ], + "emails": [ + "~Soheil_Feizi2", + "~Cassidy_Laidlaw1", + "~Sahil_Singla1" + ], + "rank": 283 + }, + { + "url": "https://openreview.net/forum?id=MBOyiNnYthd", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7, + 7 + ], + "rating": "6.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "In this paper we analyse and improve integer discrete flows for lossless compression. 
Integer discrete flows are a recently proposed class of models that learn invertible transformations for integer-valued random variables. Their discrete nature makes them particularly suitable for lossless compression with entropy coding schemes. We start by investigating a recent theoretical claim that states that invertible flows for discrete random variables are less flexible than their continuous counterparts. We demonstrate with a proof that this claim does not hold for integer discrete flows due to the embedding of data with finite support into the countably infinite integer lattice. Furthermore, we zoom in on the effect of gradient bias due to the straight-through estimator in integer discrete flows, and demonstrate that its influence is highly dependent on architecture choices and less prominent than previously thought. Finally, we show how different architecture modifications improve the performance of this model class for lossless compression, and that they also enable more efficient compression: a model with half the number of flow layers performs on par with or better than the original integer discrete flow model.", + "title": "IDF++: Analyzing and Improving Integer Discrete Flows for Lossless Compression", + "authors": [ + "Rianne van den Berg", + "Alexey A. Gritsenko", + "Mostafa Dehghani", + "Casper Kaae S\u00f8nderby", + "Tim Salimans" + ], + "emails": [ + "~Tim_Salimans1", + "~Mostafa_Dehghani1", + "~Casper_Kaae_S\u00f8nderby1", + "~Alexey_A._Gritsenko1", + "~Rianne_van_den_Berg1" + ], + "rank": 284 + }, + { + "url": "https://openreview.net/forum?id=uR9LaO_QxF", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 7, + 7, + 5 + ], + "rating": "6.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Many real-world applications such as robotics provide hard constraints on power and compute that limit the viable model complexity of Reinforcement Learning (RL) agents. 
Similarly, in many distributed RL settings, acting is done on un-accelerated hardware such as CPUs, which likewise restricts model size to prevent intractable experiment run times. These \"actor-latency\" constrained settings present a major obstruction to the scaling up of model complexity that has recently been extremely successful in supervised learning. To be able to utilize large model capacity while still operating within the limits imposed by the system during acting, we develop an \"Actor-Learner Distillation\" (ALD) procedure that leverages a continual form of distillation that transfers learning progress from a large capacity learner model to a small capacity actor model. As a case study, we develop this procedure in the context of partially-observable environments, where transformer models have had large improvements over LSTMs recently, at the cost of significantly higher computational complexity. With transformer models as the learner and LSTMs as the actor, we demonstrate in several challenging memory environments that using Actor-Learner Distillation largely recovers the clear sample-efficiency gains of the transformer learner model while maintaining the fast inference and reduced total training time of the LSTM actor model.", + "title": "Efficient Transformers in Reinforcement Learning using Actor-Learner Distillation", + "authors": [ + "Emilio Parisotto", + "Russ Salakhutdinov" + ], + "emails": [ + "~Emilio_Parisotto1", + "~Russ_Salakhutdinov1" + ], + "rank": 285 + }, + { + "url": "https://openreview.net/forum?id=mCtadqIxOJ", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7, + 7 + ], + "rating": "6.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Synthesizing programs from examples requires searching over a vast, combinatorial space of possible programs. 
In this search process, a key challenge is representing the behavior of a partially written program before it can be executed, to judge if it is on the right track and predict where to search next. We introduce a general technique for representing partially written programs in a program synthesis engine. We take inspiration from the technique of abstract interpretation, in which an approximate execution model is used to determine if an unfinished program will eventually satisfy a goal specification. Here we learn an approximate execution model implemented as a modular neural network. By constructing compositional program representations that implicitly encode the interpretation semantics of the underlying programming language, we can represent partial programs using a flexible combination of concrete execution state and learned neural representations, using the learned approximate semantics when concrete semantics are not known (in unfinished parts of the program). We show that these hybrid neuro-symbolic representations enable execution-guided synthesizers to use more powerful language constructs, such as loops and higher-order functions, and can be used to synthesize programs more accurately for a given search budget than pure neural approaches in several domains.", + "title": "Representing Partial Programs with Blended Abstract Semantics", + "authors": [ + "Maxwell Nye", + "Yewen Pu", + "Matthew Bowers", + "Jacob Andreas", + "Joshua B. 
Tenenbaum", + "Armando Solar-Lezama" + ], + "emails": [ + "~Armando_Solar-Lezama1", + "~Jacob_Andreas1", + "mit.edu", + "~Yewen_Pu1", + "~Joshua_B._Tenenbaum1", + "~Maxwell_Nye1" + ], + "rank": 286 + }, + { + "url": "https://openreview.net/forum?id=c9-WeM-ceB", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 7 + ], + "rating": "6.75", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "Poor generalization is one symptom of models that learn to predict target variables using spuriously-correlated image features present only in the training distribution instead of the true image features that denote a class. It is often thought that this can be diagnosed visually using attribution (aka saliency) maps. We study if this assumption is correct. In some prediction tasks, such as for medical images, one may have some images with masks drawn by a human expert, indicating a region of the image containing relevant information to make the prediction. We study multiple methods that take advantage of such auxiliary labels, by training networks to ignore distracting features which may be found outside of the region of interest. This mask information is only used during training and has an impact on generalization accuracy depending on the severity of the shift between the training and test distributions. Surprisingly, while these methods improve generalization performance in the presence of a covariate shift, there is no strong correspondence between the correction of attribution towards the features a human expert have labelled as important and generalization performance. 
These results suggest that the root cause of poor generalization may not always be spatially defined, and raise questions about the utility of masks as 'attribution priors' as well as saliency maps for explainable predictions.", + "title": "Saliency is a Possible Red Herring When Diagnosing Poor Generalization", + "authors": [ + "Joseph D Viviano", + "Becks Simpson", + "Francis Dutil", + "Yoshua Bengio", + "Joseph Paul Cohen" + ], + "emails": [ + "~Francis_Dutil1", + "~Becks_Simpson1", + "~Yoshua_Bengio1", + "~Joseph_Paul_Cohen1", + "~Joseph_D_Viviano1" + ], + "rank": 287 + }, + { + "url": "https://openreview.net/forum?id=Ig53hpHxS4", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 5, + 9 + ], + "rating": "6.75", + "confidences": [ + 5, + 3, + 3, + 5 + ], + "abstract": "In this paper we propose Flowtron: an autoregressive flow-based generative network for text-to-speech synthesis with style transfer and speech variation. Flowtron borrows insights from Autoregressive Flows and revamps Tacotron 2 in order to provide high-quality and expressive mel-spectrogram synthesis. Flowtron is optimized by maximizing the likelihood of the training data, which makes training simple and stable. Flowtron learns an invertible mapping of data to a latent space that can be used to modulate many aspects of speech synthesis (timbre, expressivity, accent). Our mean opinion scores (MOS) show that Flowtron matches state-of-the-art TTS models in terms of speech quality. We provide results on speech variation, interpolation over time between samples and style transfer between seen and unseen speakers. Code and pre-trained models are publicly available at \\href{https://github.com/NVIDIA/flowtron}{https://github.com/NVIDIA/flowtron}.", + "title": "Flowtron: an Autoregressive Flow-based Generative Network for Text-to-Speech Synthesis", + "authors": [ + "Rafael Valle", + "Kevin J. 
Shih", + "Ryan Prenger", + "Bryan Catanzaro" + ], + "emails": [ + "~Rafael_Valle1", + "nvidia.com", + "~Kevin_J._Shih1", + "~Bryan_Catanzaro1" + ], + "rank": 288 + }, + { + "url": "https://openreview.net/forum?id=nkIDwI6oO4_", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 6 + ], + "rating": "6.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Solving continuous minimax optimization is of extensive practical interest, yet notoriously unstable and difficult. This paper introduces the learning to optimize(L2O) methodology to the minimax problems for the first time and addresses its accompanying unique challenges. We first present Twin-L2O, the first dedicated minimax L2O method consisting of two LSTMs for updating min and max variables separately. The decoupled design is found to facilitate learning, particularly when the min and max variables are highly asymmetric. Empirical experiments on a variety of minimax problems corroborate the effectiveness of Twin-L2O. We then discuss a crucial concern of Twin-L2O, i.e., its inevitably limited generalizability to unseen optimizees. To address this issue, we present two complementary strategies. Our first solution, Enhanced Twin-L2O, is empirically applicable for general minimax problems, by improving L2O training via leveraging curriculum learning. Our second alternative, called Safeguarded Twin-L2O, is a preliminary theoretical exploration stating that under some strong assumptions, it is possible to theoretically establish the convergence of Twin-L2O. We benchmark our algorithms on several testbed problems and compare against state-of-the-art minimax solvers. 
The code is available at: https://github.com/VITA-Group/L2O-Minimax.", + "title": "Learning A Minimax Optimizer: A Pilot Study", + "authors": [ + "Jiayi Shen", + "Xiaohan Chen", + "Howard Heaton", + "Tianlong Chen", + "Jialin Liu", + "Wotao Yin", + "Zhangyang Wang" + ], + "emails": [ + "~Wotao_Yin1", + "~Zhangyang_Wang1", + "~Tianlong_Chen1", + "~Xiaohan_Chen1", + "~Jiayi_Shen1", + "~Jialin_Liu1", + "~Howard_Heaton2" + ], + "rank": 289 + }, + { + "url": "https://openreview.net/forum?id=yr1mzrH3IC", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 6, + 7, + 7 + ], + "rating": "6.75", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "Deep Reinforcement Learning (Deep RL) has been receiving increasingly more attention thanks to its encouraging performance on a variety of control tasks. Yet, conventional regularization techniques in training neural networks (e.g., L2 regularization, dropout) have been largely ignored in RL methods, possibly because agents are typically trained and evaluated in the same environment, and because the deep RL community focuses more on high-level algorithm designs. In this work, we present the first comprehensive study of regularization techniques with multiple policy optimization algorithms on continuous control tasks. Interestingly, we find conventional regularization techniques on the policy networks can often bring large improvement, especially on harder tasks. Our findings are shown to be robust against training hyperparameter variations. We also compare these techniques with the more widely used entropy regularization. In addition, we study regularizing different components and find that only regularizing the policy network is typically the best. We further analyze why regularization may help generalization in RL from four perspectives - sample complexity, reward distribution, weight norm, and noise robustness. We hope our study provides guidance for future practices in regularizing policy optimization algorithms. 
Our code is available at https://github.com/xuanlinli17/iclr2021_rlreg .", + "title": "Regularization Matters in Policy Optimization - An Empirical Study on Continuous Control", + "authors": [ + "Zhuang Liu", + "Xuanlin Li", + "Bingyi Kang", + "Trevor Darrell" + ], + "emails": [ + "~Bingyi_Kang1", + "~Zhuang_Liu1", + "~Trevor_Darrell2", + "~Xuanlin_Li1" + ], + "rank": 290 + }, + { + "url": "https://openreview.net/forum?id=v9hAX77--cZ", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 7, + 8 + ], + "rating": "6.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "While most neural generative models generate outputs in a single pass, the human creative process is usually one of iterative building and refinement. Recent work has proposed models of editing processes, but these mostly focus on editing sequential data and/or only model a single editing pass. In this paper, we present a generic model for incremental editing of structured data (i.e. ''structural edits''). Particularly, we focus on tree-structured data, taking abstract syntax trees of computer programs as our canonical example. Our editor learns to iteratively generate tree edits (e.g. deleting or adding a subtree) and applies them to the partially edited data, thereby the entire editing process can be formulated as consecutive, incremental tree transformations. To show the unique benefits of modeling tree edits directly, we further propose a novel edit encoder for learning to represent edits, as well as an imitation learning method that allows the editor to be more robust. We evaluate our proposed editor on two source code edit datasets, where results show that, with the proposed edit encoder, our editor significantly improves accuracy over previous approaches that generate the edited program directly in one pass. 
Finally, we demonstrate that training our editor to imitate experts and correct its mistakes dynamically can further improve its performance.", + "title": "Learning Structural Edits via Incremental Tree Transformations", + "authors": [ + "Ziyu Yao", + "Frank F. Xu", + "Pengcheng Yin", + "Huan Sun", + "Graham Neubig" + ], + "emails": [ + "~Ziyu_Yao1", + "~Frank_F._Xu1", + "~Huan_Sun1", + "~Pengcheng_Yin1", + "~Graham_Neubig1" + ], + "rank": 291 + }, + { + "url": "https://openreview.net/forum?id=qZzy5urZw9", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6 + ], + "rating": "6.75", + "confidences": [ + 4, + 5, + 3 + ], + "abstract": "A recent study (Rice et al., 2020) revealed overfitting to be a dominant phenomenon in adversarially robust training of deep networks, and that appropriate early-stopping of adversarial training (AT) could match the performance gains of most recent algorithmic improvements. This intriguing problem of robust overfitting motivates us to seek more remedies. As a pilot study, this paper investigates two empirical means to inject more learned smoothening during AT: one leveraging knowledge distillation and self-training to smooth the logits, the other performing stochastic weight averaging (Izmailov et al., 2018) to smooth the weights. Despite the embarrassing simplicity, the two approaches are surprisingly effective and hassle-free in mitigating robust overfitting. Experiments demonstrate that by plugging in them to AT, we can simultaneously boost the standard accuracy by 3.72%\u223c6.68% and robust accuracy by 0.22%\u223c2.03%, across multiple datasets (STL-10, SVHN, CIFAR-10, CIFAR-100, and Tiny ImageNet), perturbation types (\u2113\u221e and \u21132), and robustified methods (PGD, TRADES, and FSGM), establishing the new state-of-the-art bar in AT. We present systematic visualizations and analyses to dive into their possible working mechanisms. 
We also carefully exclude the possibility of gradient masking by evaluating our models' robustness against transfer attacks. Codes are available at https://github.com/VITA-Group/Alleviate-Robust-Overfitting.", + "title": "Robust Overfitting may be mitigated by properly learned smoothening", + "authors": [ + "Tianlong Chen", + "Zhenyu Zhang", + "Sijia Liu", + "Shiyu Chang", + "Zhangyang Wang" + ], + "emails": [ + "~Shiyu_Chang2", + "~Zhangyang_Wang1", + "~Tianlong_Chen1", + "~Sijia_Liu1", + "~Zhenyu_Zhang4" + ], + "rank": 292 + }, + { + "url": "https://openreview.net/forum?id=LSFCEb3GYU7", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 7, + 6 + ], + "rating": "6.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "A key aspect of human intelligence is the ability to infer abstract rules directly from high-dimensional sensory data, and to do so given only a limited amount of training experience. Deep neural network algorithms have proven to be a powerful tool for learning directly from high-dimensional data, but currently lack this capacity for data-efficient induction of abstract rules, leading some to argue that symbol-processing mechanisms will be necessary to account for this capacity. In this work, we take a step toward bridging this gap by introducing the Emergent Symbol Binding Network (ESBN), a recurrent network augmented with an external memory that enables a form of variable-binding and indirection. This binding mechanism allows symbol-like representations to emerge through the learning process without the need to explicitly incorporate symbol-processing machinery, enabling the ESBN to learn rules in a manner that is abstracted away from the particular entities to which those rules apply. 
Across a series of tasks, we show that this architecture displays nearly perfect generalization of learned rules to novel entities given only a limited number of training examples, and outperforms a number of other competitive neural network architectures.", + "title": "Emergent Symbols through Binding in External Memory", + "authors": [ + "Taylor Whittington Webb", + "Ishan Sinha", + "Jonathan Cohen" + ], + "emails": [ + "~Taylor_Whittington_Webb1", + "gmail.com", + "~Jonathan_Cohen1" + ], + "rank": 293 + }, + { + "url": "https://openreview.net/forum?id=QpNz8r_Ri2Y", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 6 + ], + "rating": "6.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "One of the main challenges in offline and off-policy reinforcement learning is to cope with the distribution shift that arises from the mismatch between the target policy and the data collection policy. In this paper, we focus on a model-based approach, particularly on learning the representation for a robust model of the environment under the distribution shift, which has been first studied by Representation Balancing MDP (RepBM). Although this prior work has shown promising results, there are a number of shortcomings that still hinder its applicability to practical tasks. In particular, we address the curse of horizon exhibited by RepBM, rejecting most of the pre-collected data in long-term tasks. We present a new objective for model learning motivated by recent advances in the estimation of stationary distribution corrections. This effectively overcomes the aforementioned limitation of RepBM, as well as naturally extending to continuous action spaces and stochastic policies. 
We also present an offline model-based policy optimization using this new objective, yielding the state-of-the-art performance in a representative set of benchmark offline RL tasks.", + "title": "Representation Balancing Offline Model-based Reinforcement Learning", + "authors": [ + "Byung-Jun Lee", + "Jongmin Lee", + "Kee-Eung Kim" + ], + "emails": [ + "~Jongmin_Lee1", + "~Byung-Jun_Lee1", + "~Kee-Eung_Kim4" + ], + "rank": 294 + }, + { + "url": "https://openreview.net/forum?id=09-528y2Fgf", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 6 + ], + "rating": "6.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "In this work, we investigate the positional encoding methods used in language pre-training (e.g., BERT) and identify several problems in the existing formulations. First, we show that in the absolute positional encoding, the addition operation applied on positional embeddings and word embeddings brings mixed correlations between the two heterogeneous information resources. It may bring unnecessary randomness in the attention and further limit the expressiveness of the model. Second, we question whether treating the position of the symbol \\texttt{[CLS]} the same as other words is a reasonable design, considering its special role (the representation of the entire sentence) in the downstream tasks. Motivated from above analysis, we propose a new positional encoding method called \\textbf{T}ransformer with \\textbf{U}ntied \\textbf{P}ositional \\textbf{E}ncoding (TUPE). In the self-attention module, TUPE computes the word contextual correlation and positional correlation separately with different parameterizations and then adds them together. This design removes the mixed and noisy correlations over heterogeneous embeddings and offers more expressiveness by using different projection matrices. Furthermore, TUPE unties the \\texttt{[CLS]} symbol from other positions, making it easier to capture information from all positions. 
Extensive experiments and ablation studies on GLUE benchmark demonstrate the effectiveness of the proposed method. Codes and models are released at \\url{https://github.com/guolinke/TUPE}.", + "title": "Rethinking Positional Encoding in Language Pre-training", + "authors": [ + "Guolin Ke", + "Di He", + "Tie-Yan Liu" + ], + "emails": [ + "~Guolin_Ke3", + "~Di_He1", + "~Tie-Yan_Liu1" + ], + "rank": 295 + }, + { + "url": "https://openreview.net/forum?id=e12NDM7wkEY", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6 + ], + "rating": "6.75", + "confidences": [ + 4, + 5, + 3 + ], + "abstract": "Clustering is one of the most fundamental tasks in machine learning. Recently, deep clustering has become a major trend in clustering techniques. Representation learning often plays an important role in the effectiveness of deep clustering, and thus can be a principal cause of performance degradation. In this paper, we propose a clustering-friendly representation learning method using instance discrimination and feature decorrelation. Our deep-learning-based representation learning method is motivated by the properties of classical spectral clustering. Instance discrimination learns similarities among data and feature decorrelation removes redundant correlation among features. We utilize an instance discrimination method in which learning individual instance classes leads to learning similarity among instances. Through detailed experiments and examination, we show that the approach can be adapted to learning a latent space for clustering. We design novel softmax-formulated decorrelation constraints for learning. In evaluations of image clustering using CIFAR-10 and ImageNet-10, our method achieves accuracy of 81.5% and 95.4%, respectively. 
We also show that the softmax-formulated constraints are compatible with various neural networks.", + "title": "Clustering-friendly Representation Learning via Instance Discrimination and Feature Decorrelation", + "authors": [ + "Yaling Tao", + "Kentaro Takagi", + "Kouta Nakata" + ], + "emails": [ + "~Yaling_Tao1", + "toshiba.co.jp" + ], + "rank": 296 + }, + { + "url": "https://openreview.net/forum?id=US-TP-xnXI", + "decision": "Accept (Spotlight)", + "ratings": [ + 6, + 8, + 6, + 7 + ], + "rating": "6.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "We propose a new framework, Translation between Augmented Natural Languages (TANL), to solve many structured prediction language tasks including joint entity and relation extraction, nested named entity recognition, relation classification, semantic role labeling, event extraction, coreference resolution, and dialogue state tracking. Instead of tackling the problem by training task-specific discriminative classifiers, we frame it as a translation task between augmented natural languages, from which the task-relevant information can be easily extracted. Our approach can match or outperform task-specific models on all tasks, and in particular achieves new state-of-the-art results on joint entity and relation extraction (CoNLL04, ADE, NYT, and ACE2005 datasets), relation classification (FewRel and TACRED), and semantic role labeling (CoNLL-2005 and CoNLL-2012). We accomplish this while using the same architecture and hyperparameters for all tasks, and even when training a single model to solve all tasks at the same time (multi-task learning). 
Finally, we show that our framework can also significantly improve the performance in a low-resource regime, thanks to better use of label semantics.", + "title": "Structured Prediction as Translation between Augmented Natural Languages", + "authors": [ + "Giovanni Paolini", + "Ben Athiwaratkun", + "Jason Krone", + "Jie Ma", + "Alessandro Achille", + "RISHITA ANUBHAI", + "Cicero Nogueira dos Santos", + "Bing Xiang", + "Stefano Soatto" + ], + "emails": [ + "~Ben_Athiwaratkun1", + "~Bing_Xiang2", + "~Jie_Ma3", + "~Alessandro_Achille1", + "~RISHITA_ANUBHAI2", + "amazon.com", + "~Cicero_Nogueira_dos_Santos1", + "~Giovanni_Paolini1", + "~Stefano_Soatto1" + ], + "rank": 297 + }, + { + "url": "https://openreview.net/forum?id=3k20LAiHYL2", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 7, + 8, + 8 + ], + "rating": "6.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Pretrained language models (PTLM) have achieved impressive results in a range of natural language understanding (NLU) and generation (NLG) tasks that require a syntactic and semantic understanding of the text. However, current pre-training objectives such as masked token prediction (for BERT-style PTLMs) and masked span infilling (for T5-style PTLMs) do not explicitly model the relational and compositional commonsense knowledge about everyday concepts, which is crucial to many downstream tasks requiring commonsense reasoning. To augment PTLMs with common sense, we propose generative and contrastive objectives as intermediate self-supervised pre-training tasks between general pre-training and downstream task-specific fine-tuning. We also propose a joint training framework to unify generative and contrastive objectives so that these objectives can be more effective.\nOur proposed objectives can pack more commonsense knowledge into the parameters of a pre-trained text-to-text transformer without relying on external knowledge bases, yielding better performance on both NLU and NLG tasks. 
We apply our method on a pre-trained T5 model in an intermediate task transfer learning fashion to train a concept-aware language model (CALM) and experiment with five commonsense benchmarks (four NLU tasks and one NLG task). Experimental results show that CALM outperforms baseline methods by a consistent margin.", + "title": "Pre-training Text-to-Text Transformers for Concept-centric Common Sense", + "authors": [ + "Wangchunshu Zhou", + "Dong-Ho Lee", + "Ravi Kiran Selvam", + "Seyeon Lee", + "Xiang Ren" + ], + "emails": [ + "~Seyeon_Lee1", + "~Wangchunshu_Zhou1", + "~Dong-Ho_Lee1", + "~Xiang_Ren1", + "~Ravi_Kiran_Selvam1" + ], + "rank": 298 + }, + { + "url": "https://openreview.net/forum?id=Ovp8dvB8IBH", + "decision": "Accept (Poster)", + "ratings": [ + 9, + 7, + 5, + 6 + ], + "rating": "6.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Data augmentation is often used to enlarge datasets with synthetic samples generated in accordance with the underlying data distribution. To enable a wider range of augmentations, we explore negative data augmentation strategies (NDA) that intentionally create out-of-distribution samples. We show that such negative out-of-distribution samples provide information on the support of the data distribution, and can be leveraged for generative modeling and representation learning. We introduce a new GAN training objective where we use NDA as an additional source of synthetic data for the discriminator. We prove that under suitable conditions, optimizing the resulting objective still recovers the true data distribution but can directly bias the generator towards avoiding samples that lack the desired structure. Empirically, models trained with our method achieve improved conditional/unconditional image generation along with improved anomaly detection capabilities. 
Further, we incorporate the same negative data augmentation strategy in a contrastive learning framework for self-supervised representation learning on images and videos, achieving improved performance on downstream image classification, object detection, and action recognition tasks. These results suggest that prior knowledge on what does not constitute valid data is an effective form of weak supervision across a range of unsupervised learning tasks.", + "title": "Negative Data Augmentation ", + "authors": [ + "Abhishek Sinha", + "Kumar Ayush", + "Jiaming Song", + "Burak Uzkent", + "Hongxia Jin", + "Stefano Ermon" + ], + "emails": [ + "~Jiaming_Song1", + "~Burak_Uzkent1", + "~Kumar_Ayush2", + "~Hongxia_Jin1", + "~Abhishek_Sinha1", + "~Stefano_Ermon1" + ], + "rank": 299 + }, + { + "url": "https://openreview.net/forum?id=AY8zfZm0tDd", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 7 + ], + "rating": "6.75", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "Using a high Update-To-Data (UTD) ratio, model-based methods have recently achieved much higher sample efficiency than previous model-free methods for continuous-action DRL benchmarks. In this paper, we introduce a simple model-free algorithm, Randomized Ensembled Double Q-Learning (REDQ), and show that its performance is just as good as, if not better than, a state-of-the-art model-based algorithm for the MuJoCo benchmark. Moreover, REDQ can achieve this performance using fewer parameters than the model-based method, and with less wall-clock run time. REDQ has three carefully integrated ingredients which allow it to achieve its high performance: (i) a UTD ratio \u226b1; (ii) an ensemble of Q functions; (iii) in-target minimization across a random subset of Q functions from the ensemble. Through carefully designed experiments, we provide a detailed analysis of REDQ and related model-free algorithms. 
To our knowledge, REDQ is the first successful model-free DRL algorithm for continuous-action spaces using a UTD ratio \u226b1. ", + "title": "Randomized Ensembled Double Q-Learning: Learning Fast Without a Model", + "authors": [ + "Xinyue Chen", + "Che Wang", + "Zijian Zhou", + "Keith W. Ross" + ], + "emails": [ + "~Che_Wang1", + "~Zijian_Zhou1", + "~Keith_W._Ross1", + "~Xinyue_Chen1" + ], + "rank": 300 + }, + { + "url": "https://openreview.net/forum?id=TQt98Ya7UMP", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 6 + ], + "rating": "6.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Deploying Reinforcement Learning (RL) agents to solve real-world applications often requires satisfying complex system constraints. Often the constraint thresholds are incorrectly set due to the complex nature of a system or the inability to verify the thresholds offline (e.g, no simulator or reasonable offline evaluation procedure exists). This results in solutions where a task cannot be solved without violating the constraints. However, in many real-world cases, constraint violations are undesirable yet they are not catastrophic, motivating the need for soft-constrained RL approaches. We present two soft-constrained RL approaches that utilize meta-gradients to find a good trade-off between expected return and minimizing constraint violations. We demonstrate the effectiveness of these approaches by showing that they consistently outperform the baselines across four different Mujoco domains.", + "title": "Balancing Constraints and Rewards with Meta-Gradient D4PG", + "authors": [ + "Dan A. 
Calian", + "Daniel J Mankowitz", + "Tom Zahavy", + "Zhongwen Xu", + "Junhyuk Oh", + "Nir Levine", + "Timothy Mann" + ], + "emails": [ + "~Nir_Levine2", + "~Zhongwen_Xu1", + "~Tom_Zahavy2", + "~Daniel_J_Mankowitz2", + "google.com", + "~Junhyuk_Oh2", + "~Timothy_Mann1" + ], + "rank": 301 + }, + { + "url": "https://openreview.net/forum?id=Rld-9OxQ6HU", + "decision": "Reject", + "ratings": [ + 7, + 7, + 6, + 7 + ], + "rating": "6.75", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "The success of Convolutional Neural Networks (CNNs) in computer vision is mainly driven by their strong inductive bias, which is strong enough to allow CNNs to solve vision-related tasks with random weights, meaning without learning. Similarly, Long Short-Term Memory (LSTM) has a strong inductive bias towards storing information over time. However, many real-world systems are governed by conservation laws, which lead to the redistribution of particular quantities \u2014e.g. in physical and economical systems. Our novel Mass-Conserving LSTM (MC-LSTM) adheres to these conservation laws by extending the inductive bias of LSTM to model the redistribution of those stored quantities. MC-LSTMs set a new state-of-the-art for neural arithmetic units at learning arithmetic operations, such as addition tasks, which have a strong conservation law, as the sum is constant overtime. Further, MC-LSTM is applied to traffic forecasting, modeling a pendulum, and a large benchmark dataset in hydrology, where it sets a new state-of-the-art for predicting peak flows. 
In the hydrology example, we show that MC-LSTM states correlate with real world processes and are therefore interpretable.", + "title": "MC-LSTM: Mass-conserving LSTM", + "authors": [ + "Pieter-Jan Hoedt", + "Frederik Kratzert", + "Daniel Klotz", + "Christina Halmich", + "Markus Holzleitner", + "Grey Nearing", + "Sepp Hochreiter", + "G\u00fcnter Klambauer" + ], + "emails": [ + "ml.jku.at", + "~Markus_Holzleitner1", + "~Pieter-Jan_Hoedt1", + "~Daniel_Klotz1", + "~G\u00fcnter_Klambauer1", + "google.com", + "~Frederik_Kratzert1", + "~Sepp_Hochreiter1" + ], + "rank": 302 + }, + { + "url": "https://openreview.net/forum?id=piLPYqxtWuA", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 8, + 7, + 7 + ], + "rating": "6.74", + "confidences": [ + 5, + 5, + 4, + 5, + 4 + ], + "abstract": "Non-autoregressive text to speech (TTS) models such as FastSpeech can synthesize speech significantly faster than previous autoregressive models with comparable quality. The training of FastSpeech model relies on an autoregressive teacher model for duration prediction (to provide more information as input) and knowledge distillation (to simplify the data distribution in output), which can ease the one-to-many mapping problem (i.e., multiple speech variations correspond to the same text) in TTS. However, FastSpeech has several disadvantages: 1) the teacher-student distillation pipeline is complicated and time-consuming, 2) the duration extracted from the teacher model is not accurate enough, and the target mel-spectrograms distilled from teacher model suffer from information loss due to data simplification, both of which limit the voice quality. 
In this paper, we propose FastSpeech 2, which addresses the issues in FastSpeech and better solves the one-to-many mapping problem in TTS by 1) directly training the model with ground-truth target instead of the simplified output from teacher, and 2) introducing more variation information of speech (e.g., pitch, energy and more accurate duration) as conditional inputs. Specifically, we extract duration, pitch and energy from speech waveform and directly take them as conditional inputs in training and use predicted values in inference. We further design FastSpeech 2s, which is the first attempt to directly generate speech waveform from text in parallel, enjoying the benefit of fully end-to-end inference. Experimental results show that 1) FastSpeech 2 achieves a 3x training speed-up over FastSpeech, and FastSpeech 2s enjoys even faster inference speed; 2) FastSpeech 2 and 2s outperform FastSpeech in voice quality, and FastSpeech 2 can even surpass autoregressive models. Audio samples are available at https://speechresearch.github.io/fastspeech2/.", + "title": "FastSpeech 2: Fast and High-Quality End-to-End Text to Speech", + "authors": [ + "Yi Ren", + "Chenxu Hu", + "Xu Tan", + "Tao Qin", + "Sheng Zhao", + "Zhou Zhao", + "Tie-Yan Liu" + ], + "emails": [ + "~Tao_Qin1", + "~Xu_Tan1", + "~Zhou_Zhao2", + "~Chenxu_Hu1", + "microsoft.com", + "~Tie-Yan_Liu1", + "~Yi_Ren2" + ], + "rank": 303 + }, + { + "url": "https://openreview.net/forum?id=Pz_dcqfcKW8", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 6 + ], + "rating": "6.74", + "confidences": [ + 5, + 5, + 4, + 5 + ], + "abstract": "Streaming automatic speech recognition (ASR) aims to emit each hypothesized word as quickly and accurately as possible, while full-context ASR waits for the completion of a full speech utterance before emitting completed hypotheses. 
In this work, we propose a unified framework, Dual-mode ASR, to train a single end-to-end ASR model with shared weights for both streaming and full-context speech recognition. We show that the latency and accuracy of streaming ASR significantly benefit from weight sharing and joint training of full-context ASR, especially with inplace knowledge distillation during the training. The Dual-mode ASR framework can be applied to recent state-of-the-art convolution-based and transformer-based ASR networks. We present extensive experiments with two state-of-the-art ASR networks, ContextNet and Conformer, on two datasets, a widely used public dataset LibriSpeech and a large-scale dataset MultiDomain. Experiments and ablation studies demonstrate that Dual-mode ASR not only simplifies the workflow of training and deploying streaming and full-context ASR models, but also significantly improves both emission latency and recognition accuracy of streaming ASR. With Dual-mode ASR, we achieve new state-of-the-art streaming ASR results on both LibriSpeech and MultiDomain in terms of accuracy and latency.", + "title": "Dual-mode ASR: Unify and Improve Streaming ASR with Full-context Modeling", + "authors": [ + "Jiahui Yu", + "Wei Han", + "Anmol Gulati", + "Chung-Cheng Chiu", + "Bo Li", + "Tara N Sainath", + "Yonghui Wu", + "Ruoming Pang" + ], + "emails": [ + "~Yonghui_Wu1", + "~Ruoming_Pang1", + "~Tara_N_Sainath1", + "~Jiahui_Yu1", + "~Anmol_Gulati1", + "~Chung-Cheng_Chiu1", + "~Wei_Han3", + "~Bo_Li1" + ], + "rank": 304 + }, + { + "url": "https://openreview.net/forum?id=TuK6agbdt27", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 6 + ], + "rating": "6.73", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Humans can quickly associate stimuli to solve problems in novel contexts. Our novel neural network model learns state representations of facts that can be composed to perform such associative inference. 
To this end, we augment the LSTM model with an associative memory, dubbed \\textit{Fast Weight Memory} (FWM). Through differentiable operations at every step of a given input sequence, the LSTM \\textit{updates and maintains} compositional associations stored in the rapidly changing FWM weights. Our model is trained end-to-end by gradient descent and yields excellent performance on compositional language reasoning problems, meta-reinforcement-learning for POMDPs, and small-scale word-level language modelling.", + "title": "Learning Associative Inference Using Fast Weight Memory", + "authors": [ + "Imanol Schlag", + "Tsendsuren Munkhdalai", + "J\u00fcrgen Schmidhuber" + ], + "emails": [ + "~J\u00fcrgen_Schmidhuber1", + "~Imanol_Schlag3", + "~Tsendsuren_Munkhdalai1" + ], + "rank": 305 + }, + { + "url": "https://openreview.net/forum?id=3hGNqpI4WS", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 7, + 8 + ], + "rating": "6.73", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Most reinforcement learning (RL) algorithms assume online access to the environment, in which one may readily interleave updates to the policy with experience collection using that policy. However, in many real-world applications such as health, education, dialogue agents, and robotics, the cost or potential risk of deploying a new data-collection policy is high, to the point that it can become prohibitive to update the data-collection policy more than a few times during learning. With this view, we propose a novel concept of deployment efficiency, measuring the number of distinct data-collection policies that are used during policy learning. We observe that na\u00efvely applying existing model-free offline RL algorithms recursively does not lead to a practical deployment-efficient and sample-efficient algorithm. 
We propose a novel model-based algorithm, Behavior-Regularized Model-ENsemble (BREMEN), that not only performs better than or comparably as the state-of-the-art dynamic-programming-based and concurrently-proposed model-based offline approaches on existing benchmarks, but can also effectively optimize a policy offline using 10-20 times fewer data than prior works. Furthermore, the recursive application of BREMEN achieves impressive deployment efficiency while maintaining the same or better sample efficiency, learning successful policies from scratch on simulated robotic environments with only 5-10 deployments, compared to typical values of hundreds to millions in standard RL baselines.", + "title": "Deployment-Efficient Reinforcement Learning via Model-Based Offline Optimization", + "authors": [ + "Tatsuya Matsushima", + "Hiroki Furuta", + "Yutaka Matsuo", + "Ofir Nachum", + "Shixiang Gu" + ], + "emails": [ + "~Ofir_Nachum1", + "~Shixiang_Gu1", + "~Hiroki_Furuta1", + "~Tatsuya_Matsushima1", + "~Yutaka_Matsuo1" + ], + "rank": 306 + }, + { + "url": "https://openreview.net/forum?id=eqBwg3AcIAK", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 6, + 6, + 7 + ], + "rating": "6.73", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "We propose a simple, practical, and intuitive approach for domain adaptation in reinforcement learning. Our approach stems from the idea that the agent's experience in the source domain should look similar to its experience in the target domain. Building off of a probabilistic view of RL, we achieve this goal by compensating for the difference in dynamics by modifying the reward function. This modified reward function is simple to estimate by learning auxiliary classifiers that distinguish source-domain transitions from target-domain transitions. Intuitively, the agent is penalized for transitions that would indicate that the agent is interacting with the source domain, rather than the target domain. 
Formally, we prove that applying our method in the source domain is guaranteed to obtain a near-optimal policy for the target domain, provided that the source and target domains satisfy a lightweight assumption. Our approach is applicable to domains with continuous states and actions and does not require learning an explicit model of the dynamics. On discrete and continuous control tasks, we illustrate the mechanics of our approach and demonstrate its scalability to high-dimensional~tasks.", + "title": "Off-Dynamics Reinforcement Learning: Training for Transfer with Domain Classifiers", + "authors": [ + "Benjamin Eysenbach", + "Shreyas Chaudhari", + "Swapnil Asawa", + "Sergey Levine", + "Ruslan Salakhutdinov" + ], + "emails": [ + "~Shreyas_Chaudhari1", + "~Benjamin_Eysenbach1", + "~Swapnil_Asawa1", + "~Ruslan_Salakhutdinov1", + "~Sergey_Levine1" + ], + "rank": 307 + }, + { + "url": "https://openreview.net/forum?id=XLfdzwNKzch", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7 + ], + "rating": "6.73", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "Backward locking and update locking are well-known sources of inefficiency in backpropagation that prevent from concurrently updating layers. Several works have recently suggested using local error signals to train network blocks asynchronously to overcome these limitations. However, they often require numerous iterations of trial-and-error to find the best configuration for local training, including how to decouple network blocks and which auxiliary networks to use for each block. In this work, we propose a differentiable search algorithm named SEDONA to automate this process. 
Experimental results show that our algorithm can consistently discover transferable decoupled architectures for VGG and ResNet variants, and significantly outperforms the ones trained with end-to-end backpropagation and other state-of-the-art greedy-leaning methods in CIFAR-10, Tiny-ImageNet and ImageNet.", + "title": "SEDONA: Search for Decoupled Neural Networks toward Greedy Block-wise Learning", + "authors": [ + "Myeongjang Pyeon", + "Jihwan Moon", + "Taeyoung Hahn", + "Gunhee Kim" + ], + "emails": [ + "~Gunhee_Kim1", + "~Myeongjang_Pyeon1", + "~Taeyoung_Hahn1", + "~Jihwan_Moon2" + ], + "rank": 308 + }, + { + "url": "https://openreview.net/forum?id=1OP1kReyL56", + "decision": "Reject", + "ratings": [ + 6, + 7, + 7, + 7 + ], + "rating": "6.73", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Transformers that are pre-trained on multilingual text corpora, such as, mBERT and XLM-RoBERTa, have achieved impressive cross-lingual transfer learning results. In the zero-shot cross-lingual transfer setting, only English training data is assumed, and the fine-tuned model is evaluated on another target language. No target-language validation data is assumed in this setting, however substantial variance has been observed in target language performance between different fine-tuning runs. Prior work has relied on English validation/development data to select among models that are fine-tuned with different learning rates, number of steps and other hyperparameters, often resulting in suboptimal choices. In this paper, we show that it is possible to select consistently better models when small amounts of annotated data are available in an auxiliary pivot language.\nWe propose a machine learning approach to model selection that uses the fine-tuned model's own internal representations to predict its cross-lingual capabilities. 
In extensive experiments we find that our approach consistently selects better models than English validation data across five languages and five well-studied NLP tasks, achieving results that are comparable to small amounts of target language development data.", + "title": "Model Selection for Cross-Lingual Transfer using a Learned Scoring Function", + "authors": [ + "Yang Chen", + "Alan Ritter" + ], + "emails": [ + "~Yang_Chen10", + "~Alan_Ritter1" + ], + "rank": 309 + }, + { + "url": "https://openreview.net/forum?id=NTEz-6wysdb", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 7 + ], + "rating": "6.73", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "The task of information retrieval is an important component of many natural language processing systems, such as open domain question answering. While traditional methods were based on hand-crafted features, continuous representations based on neural networks recently obtained competitive results. A challenge of using such methods is to obtain supervised data to train the retriever model, corresponding to pairs of query and support documents. In this paper, we propose a technique to learn retriever models for downstream tasks, inspired by knowledge distillation, and which does not require annotated pairs of query and documents. Our approach leverages attention scores of a reader model, used to solve the task based on retrieved documents, to obtain synthetic labels for the retriever. 
We evaluate our method on question answering, obtaining state-of-the-art results.", + "title": "Distilling Knowledge from Reader to Retriever for Question Answering", + "authors": [ + "Gautier Izacard", + "Edouard Grave" + ], + "emails": [ + "~Gautier_Izacard1", + "~Edouard_Grave1" + ], + "rank": 310 + }, + { + "url": "https://openreview.net/forum?id=cR91FAodFMe", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 6 + ], + "rating": "6.73", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "In audio-visual navigation, an agent intelligently travels through a complex, unmapped 3D environment using both sights and sounds to find a sound source (e.g., a phone ringing in another room). Existing models learn to act at a fixed granularity of agent motion and rely on simple recurrent aggregations of the audio observations. We introduce a reinforcement learning approach to audio-visual navigation with two key novel elements: 1) waypoints that are dynamically set and learned end-to-end within the navigation policy, and 2) an acoustic memory that provides a structured, spatially grounded record of what the agent has heard as it moves. Both new ideas capitalize on the synergy of audio and visual data for revealing the geometry of an unmapped space. We demonstrate our approach on two challenging datasets of real-world 3D scenes, Replica and Matterport3D. 
Our model improves the state of the art by a substantial margin, and our experiments reveal that learning the links between sights, sounds, and space is essential for audio-visual navigation.", + "title": "Learning to Set Waypoints for Audio-Visual Navigation", + "authors": [ + "Changan Chen", + "Sagnik Majumder", + "Ziad Al-Halah", + "Ruohan Gao", + "Santhosh Kumar Ramakrishnan", + "Kristen Grauman" + ], + "emails": [ + "~Santhosh_Kumar_Ramakrishnan1", + "~Kristen_Grauman1", + "~Sagnik_Majumder1", + "~Ruohan_Gao2", + "~Ziad_Al-Halah2", + "~Changan_Chen2" + ], + "rank": 311 + }, + { + "url": "https://openreview.net/forum?id=vQzcqQWIS0q", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 7 + ], + "rating": "6.73", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "The embedding-based representation learning is commonly used in deep learning recommendation models to map the raw sparse features to dense vectors. The traditional embedding manner that assigns a uniform size to all features has two issues. First, the numerous features inevitably lead to a gigantic embedding table that causes a high memory usage cost. Second, it is likely to cause the over-fitting problem for those features that do not require too large representation capacity. Existing works that try to address the problem always cause a significant drop in recommendation performance or suffers from the limitation of unaffordable training time cost. In this paper, we proposed a novel approach, named PEP (short for Plug-in Embedding Pruning), to reduce the size of the embedding table while avoiding the drop of recommendation accuracy. PEP prunes embedding parameter where the pruning threshold(s) can be adaptively learned from data. Therefore we can automatically obtain a mixed-dimension embedding-scheme by pruning redundant parameters for each feature. PEP is a general framework that can plug in various base recommendation models. 
Extensive experiments demonstrate it can efficiently cut down embedding parameters and boost the base model's performance. Specifically, it achieves strong recommendation performance while reducing 97-99% parameters. As for the computation cost, PEP only brings an additional 20-30% time cost compare with base models. ", + "title": "Learnable Embedding sizes for Recommender Systems", + "authors": [ + "Siyi Liu", + "Chen Gao", + "Yihong Chen", + "Depeng Jin", + "Yong Li" + ], + "emails": [ + "~Yong_Li3", + "~Yihong_Chen3", + "~Siyi_Liu1", + "tsinghua.edu.cn", + "~Chen_Gao3" + ], + "rank": 312 + }, + { + "url": "https://openreview.net/forum?id=eLfqMl3z3lq", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7, + 7 + ], + "rating": "6.73", + "confidences": [ + 3, + 3, + 3, + 2 + ], + "abstract": "Denoising Score Matching with Annealed Langevin Sampling (DSM-ALS) has recently found success in generative modeling. The approach works by first training a neural network to estimate the score of a distribution, and then using Langevin dynamics to sample from the data distribution assumed by the score network. Despite the convincing visual quality of samples, this method appears to perform worse than Generative Adversarial Networks (GANs) under the Fr\u00e9chet Inception Distance, a standard metric for generative models. We show that this apparent gap vanishes when denoising the final Langevin samples using the score network.\nIn addition, we propose two improvements to DSM-ALS: 1) Consistent Annealed Sampling as a more stable alternative to Annealed Langevin Sampling, and 2) a hybrid training formulation, composed of both Denoising Score Matching and adversarial objectives. 
By combining these two techniques and exploring different network architectures, we elevate score matching methods and obtain results competitive with state-of-the-art image generation on CIFAR-10.", + "title": "Adversarial score matching and improved sampling for image generation", + "authors": [ + "Alexia Jolicoeur-Martineau", + "R\u00e9mi Pich\u00e9-Taillefer", + "Ioannis Mitliagkas", + "Remi Tachet des Combes" + ], + "emails": [ + "~Alexia_Jolicoeur-Martineau1", + "umontreal.ca", + "~Ioannis_Mitliagkas1", + "~Remi_Tachet_des_Combes1" + ], + "rank": 313 + }, + { + "url": "https://openreview.net/forum?id=FmMKSO4e8JK", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 6, + 6 + ], + "rating": "6.73", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "In this work we consider data-driven optimization problems where one must maximize a function given only queries at a fixed set of points. This problem setting emerges in many domains where function evaluation is a complex and expensive process, such as in the design of materials, vehicles, or neural network architectures. Because the available data typically only covers a small manifold of the possible space of inputs, a principal challenge is to be able to construct algorithms that can reason about uncertainty and out-of-distribution values, since a naive optimizer can easily exploit an estimated model to return adversarial inputs. We propose to tackle the MBO problem by leveraging the normalized maximum-likelihood (NML) estimator, which provides a principled approach to handling uncertainty and out-of-distribution inputs. While in the standard formulation NML is intractable, we propose a tractable approximation that allows us to scale our method to high-capacity neural network models. 
We demonstrate that our method can effectively optimize high-dimensional design problems in a variety of disciplines such as chemistry, biology, and materials engineering.", + "title": "Offline Model-Based Optimization via Normalized Maximum Likelihood Estimation", + "authors": [ + "Justin Fu", + "Sergey Levine" + ], + "emails": [ + "~Sergey_Levine1", + "~Justin_Fu1" + ], + "rank": 314 + }, + { + "url": "https://openreview.net/forum?id=pzpytjk3Xb2", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 7 + ], + "rating": "6.73", + "confidences": [ + 3, + 3, + 4, + 5 + ], + "abstract": "To craft black-box adversarial examples, adversaries need to query the victim model and take proper advantage of its feedback. Existing black-box attacks generally suffer from high query complexity, especially when only the top-1 decision (i.e., the hard-label prediction) of the victim model is available. In this paper, we propose a novel hard-label black-box attack named Policy-Driven Attack, to reduce the query complexity. Our core idea is to learn promising search directions of the adversarial examples using a well-designed policy network in a novel reinforcement learning formulation, in which the queries become more sensible. Experimental results demonstrate that our method can significantly reduce the query complexity in comparison with existing state-of-the-art hard-label black-box attacks on various image classification benchmark datasets. 
Code and models for reproducing our results are available at https://github.com/ZiangYan/pda.pytorch", + "title": "Policy-Driven Attack: Learning to Query for Hard-label Black-box Adversarial Examples", + "authors": [ + "Ziang Yan", + "Yiwen Guo", + "Jian Liang", + "Changshui Zhang" + ], + "emails": [ + "~Jian_Liang3", + "~Ziang_Yan1", + "~Yiwen_Guo1", + "~Changshui_Zhang1" + ], + "rank": 315 + }, + { + "url": "https://openreview.net/forum?id=4c0J6lwQ4_", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7, + 7 + ], + "rating": "6.73", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Irregular sampling occurs in many time series modeling applications where it presents a significant challenge to standard deep learning models. This work is motivated by the analysis of physiological time series data in electronic health records, which are sparse, irregularly sampled, and multivariate. In this paper, we propose a new deep learning framework for this setting that we call Multi-Time Attention Networks. Multi-Time Attention Networks learn an embedding of continuous time values and use an attention mechanism to produce a fixed-length representation of a time series containing a variable number of observations. We investigate the performance of this framework on interpolation and classification tasks using multiple datasets. 
Our results show that the proposed approach performs as well or better than a range of baseline and recently proposed models while offering significantly faster training times than current state-of-the-art methods.", + "title": "Multi-Time Attention Networks for Irregularly Sampled Time Series", + "authors": [ + "Satya Narayan Shukla", + "Benjamin Marlin" + ], + "emails": [ + "~Satya_Narayan_Shukla1", + "~Benjamin_Marlin1" + ], + "rank": 316 + }, + { + "url": "https://openreview.net/forum?id=WznmQa42ZAx", + "decision": "Accept (Spotlight)", + "ratings": [ + 6, + 7, + 7, + 7 + ], + "rating": "6.73", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Graph neural networks (GNNs) have become a popular approach to integrating structural inductive biases into NLP models. However, there has been little work on interpreting them, and specifically on understanding which parts of the graphs (e.g. syntactic trees or co-reference structures) contribute to a prediction. In this work, we introduce a post-hoc method for interpreting the predictions of GNNs which identifies unnecessary edges. Given a trained GNN model, we learn a simple classifier that, for every edge in every layer, predicts if that edge can be dropped. We demonstrate that such a classifier can be trained in a fully differentiable fashion, employing stochastic gates and encouraging sparsity through the expected L0 norm. We use our technique as an attribution method to analyze GNN models for two tasks -- question answering and semantic role labeling -- providing insights into the information flow in these models. 
We show that we can drop a large proportion of edges without deteriorating the performance of the model, while we can analyse the remaining edges for interpreting model predictions.", + "title": "Interpreting Graph Neural Networks for NLP With Differentiable Edge Masking", + "authors": [ + "Michael Sejr Schlichtkrull", + "Nicola De Cao", + "Ivan Titov" + ], + "emails": [ + "~Ivan_Titov1", + "~Nicola_De_Cao1", + "~Michael_Sejr_Schlichtkrull1" + ], + "rank": 317 + }, + { + "url": "https://openreview.net/forum?id=dx11_7vm5_r", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 6 + ], + "rating": "6.73", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Optimistic Gradient Descent Ascent (OGDA) and Optimistic Multiplicative Weights Update (OMWU) for saddle-point optimization have received growing attention due to their favorable last-iterate convergence. However, their behaviors for simple bilinear games over the probability simplex are still not fully understood --- previous analysis lacks explicit convergence rates, only applies to an exponentially small learning rate, or requires additional assumptions such as the uniqueness of the optimal solution.\n\nIn this work, we significantly expand the understanding of last-iterate convergence for OGDA and OMWU in the constrained setting. Specifically, for OMWU in bilinear games over the simplex, we show that when the equilibrium is unique, linear last-iterate convergence is achievable with a constant learning rate, which improves the result of (Daskalakis & Panageas, 2019) under the same assumption. We then significantly extend the results to more general objectives and feasible sets for the projected OGDA algorithm, by introducing a sufficient condition under which OGDA exhibits concrete last-iterate convergence rates with a constant learning rate. We show that bilinear games over any polytope satisfy this condition and OGDA converges exponentially fast even without the unique equilibrium assumption. 
Our condition also holds for strongly-convex-strongly-concave functions, recovering the result of (Hsieh et al., 2019). Finally, we provide experimental results to further support our theory. ", + "title": "Linear Last-iterate Convergence in Constrained Saddle-point Optimization", + "authors": [ + "Chen-Yu Wei", + "Chung-Wei Lee", + "Mengxiao Zhang", + "Haipeng Luo" + ], + "emails": [ + "~Chung-Wei_Lee1", + "~Chen-Yu_Wei1", + "~Mengxiao_Zhang2", + "~Haipeng_Luo1" + ], + "rank": 318 + }, + { + "url": "https://openreview.net/forum?id=WesiCoRVQ15", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7, + 7 + ], + "rating": "6.73", + "confidences": [ + 2, + 3, + 3, + 3 + ], + "abstract": "We show when maximizing a properly defined f-divergence measure with respect to a classifier's predictions and the supervised labels is robust with label noise. Leveraging its variational form, we derive a nice decoupling property for a family of f-divergence measures when label noise presents, where the divergence is shown to be a linear combination of the variational difference defined on the clean distribution and a bias term introduced due to the noise. The above derivation helps us analyze the robustness of different f-divergence functions. With established robustness, this family of f-divergence functions arises as useful metrics for the problem of learning with noisy labels, which do not require the specification of the labels' noise rate. When they are possibly not robust, we propose fixes to make them so. In addition to the analytical results, we present thorough experimental evidence. 
Our code is available at https://github.com/UCSC-REAL/Robust-f-divergence-measures.", + "title": "When Optimizing f-Divergence is Robust with Label Noise", + "authors": [ + "Jiaheng Wei", + "Yang Liu" + ], + "emails": [ + "~Yang_Liu3", + "~Jiaheng_Wei1" + ], + "rank": 319 + }, + { + "url": "https://openreview.net/forum?id=gwnoVHIES05", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 7 + ], + "rating": "6.73", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Sketching or doodling is a popular creative activity that people engage in. However, most existing work in automatic sketch understanding or generation has focused on sketches that are quite mundane. In this work, we introduce two datasets of creative sketches -- Creative Birds and Creative Creatures -- containing 10k sketches each along with part annotations. We propose DoodlerGAN -- a part-based Generative Adversarial Network (GAN) -- to generate unseen compositions of novel part appearances. Quantitative evaluations as well as human studies demonstrate that sketches generated by our approach are more creative and of higher quality than existing approaches. In fact, in Creative Birds, subjects prefer sketches generated by DoodlerGAN over those drawn by humans!", + "title": "Creative Sketch Generation", + "authors": [ + "Songwei Ge", + "Vedanuj Goswami", + "Larry Zitnick", + "Devi Parikh" + ], + "emails": [ + "~Devi_Parikh1", + "~Vedanuj_Goswami1", + "~Larry_Zitnick1", + "~Songwei_Ge2" + ], + "rank": 320 + }, + { + "url": "https://openreview.net/forum?id=OGg9XnKxFAH", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 7, + 6, + 6 + ], + "rating": "6.73", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Recent approaches to efficiently ensemble neural networks have shown that strong robustness and uncertainty performance can be achieved with a negligible gain in parameters over the original network. 
However, these methods still require multiple forward passes for prediction, leading to a significant runtime cost. In this work, we show a surprising result:\nthe benefits of using multiple predictions can be achieved 'for free' under a single model's forward pass. In particular, we show that, using a multi-input multi-output (MIMO) configuration, one can utilize a single model's capacity to train multiple subnetworks that independently learn the task at hand. By ensembling the predictions made by the subnetworks, we improve model robustness without increasing compute. We observe a significant improvement in negative log-likelihood, accuracy, and calibration error on CIFAR10, CIFAR100, ImageNet, and their out-of-distribution variants compared to previous methods.", + "title": "Training independent subnetworks for robust prediction", + "authors": [ + "Marton Havasi", + "Rodolphe Jenatton", + "Stanislav Fort", + "Jeremiah Zhe Liu", + "Jasper Snoek", + "Balaji Lakshminarayanan", + "Andrew Mingbo Dai", + "Dustin Tran" + ], + "emails": [ + "~Marton_Havasi1", + "~Stanislav_Fort1", + "~Dustin_Tran1", + "~Balaji_Lakshminarayanan1", + "~Jeremiah_Zhe_Liu1", + "~Jasper_Snoek1", + "~Rodolphe_Jenatton3", + "~Andrew_Mingbo_Dai1" + ], + "rank": 321 + }, + { + "url": "https://openreview.net/forum?id=OMizHuea_HB", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7, + 7 + ], + "rating": "6.73", + "confidences": [ + 3, + 4, + 5, + 3 + ], + "abstract": "Contrastive learning has been shown to produce generalizable representations of audio and visual data by maximizing the lower bound on the mutual information (MI) between different views of an instance. However, obtaining a tight lower bound requires a sample size exponential in MI and thus a large set of negative samples. We can incorporate more samples by building a large queue-based dictionary, but there are theoretical limits to performance improvements even with a large number of negative samples. 
We hypothesize that random negative sampling leads to a highly redundant dictionary that results in suboptimal representations for downstream tasks. In this paper, we propose an active contrastive learning approach that builds an actively sampled dictionary with diverse and informative items, which improves the quality of negative samples and improves performances on tasks where there is high mutual information in the data, e.g., video classification. Our model achieves state-of-the-art performance on challenging audio and visual downstream benchmarks including UCF101, HMDB51 and ESC50. ", + "title": "Active Contrastive Learning of Audio-Visual Video Representations", + "authors": [ + "Shuang Ma", + "Zhaoyang Zeng", + "Daniel McDuff", + "Yale Song" + ], + "emails": [ + "~Shuang_Ma3", + "~Yale_Song1", + "~Zhaoyang_Zeng1", + "~Daniel_McDuff1" + ], + "rank": 322 + }, + { + "url": "https://openreview.net/forum?id=t0TaKv0Gx6Z", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 8 + ], + "rating": "6.73", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "Kernelized Stein discrepancy (KSD), though being extensively used in goodness-of-fit tests and model learning, suffers from the curse-of-dimensionality. We address this issue by proposing the sliced Stein discrepancy and its scalable and kernelized variants, which employs kernel-based test functions defined on the optimal one-dimensional projections. When applied to goodness-of-fit tests, extensive experiments show the proposed discrepancy significantly outperforms KSD and various baselines in high dimensions. For model learning, we show its advantages by training an independent component analysis when compared with existing Stein discrepancy baselines. 
We further propose a novel particle inference method called sliced Stein variational gradient descent (S-SVGD) which alleviates the mode-collapse issue of SVGD in training variational autoencoders.", + "title": "Sliced Kernelized Stein Discrepancy", + "authors": [ + "Wenbo Gong", + "Yingzhen Li", + "Jos\u00e9 Miguel Hern\u00e1ndez-Lobato" + ], + "emails": [ + "~Wenbo_Gong1", + "~Yingzhen_Li1", + "~Jos\u00e9_Miguel_Hern\u00e1ndez-Lobato1" + ], + "rank": 323 + }, + { + "url": "https://openreview.net/forum?id=yvQKLaqNE6M", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7 + ], + "rating": "6.73", + "confidences": [ + 3, + 3, + 5 + ], + "abstract": "Despite their recent successes, GAN models for semantic image synthesis still suffer from poor image quality when trained with only adversarial supervision. Historically, additionally employing the VGG-based perceptual loss has helped to overcome this issue, significantly improving the synthesis quality, but at the same time limiting the progress of GAN models for semantic image synthesis. In this work, we propose a novel, simplified GAN model, which needs only adversarial supervision to achieve high quality results. We re-design the discriminator as a semantic segmentation network, directly using the given semantic label maps as the ground truth for training. By providing stronger supervision to the discriminator as well as to the generator through spatially- and semantically-aware discriminator feedback, we are able to synthesize images of higher fidelity with better alignment to their input label maps, making the use of the perceptual loss superfluous. Moreover, we enable high-quality multi-modal image synthesis through global and local sampling of a 3D noise tensor injected into the generator, which allows complete or partial image change. We show that images synthesized by our model are more diverse and follow the color and texture distributions of real images more closely. 
We achieve an average improvement of 6 FID and 5 mIoU points over the state of the art across different datasets using only adversarial supervision.", + "title": "You Only Need Adversarial Supervision for Semantic Image Synthesis", + "authors": [ + "Edgar Sch\u00f6nfeld", + "Vadim Sushko", + "Dan Zhang", + "Juergen Gall", + "Bernt Schiele", + "Anna Khoreva" + ], + "emails": [ + "~Vadim_Sushko1", + "~Bernt_Schiele1", + "~Dan_Zhang1", + "~Juergen_Gall1", + "~Edgar_Sch\u00f6nfeld1", + "~Anna_Khoreva1" + ], + "rank": 324 + }, + { + "url": "https://openreview.net/forum?id=8xeBUgD8u9", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7 + ], + "rating": "6.73", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "While a diverse collection of continual learning (CL) methods has been proposed to prevent catastrophic forgetting, a thorough investigation of their effectiveness for processing sequential data with recurrent neural networks (RNNs) is lacking. Here, we provide the first comprehensive evaluation of established CL methods on a variety of sequential data benchmarks. Specifically, we shed light on the particularities that arise when applying weight-importance methods, such as elastic weight consolidation, to RNNs. In contrast to feedforward networks, RNNs iteratively reuse a shared set of weights and require working memory to process input samples. We show that the performance of weight-importance methods is not directly affected by the length of the processed sequences, but rather by high working memory requirements, which lead to an increased need for stability at the cost of decreased plasticity for learning subsequent tasks. We additionally provide theoretical arguments supporting this interpretation by studying linear RNNs. 
Our study shows that established CL methods can be successfully ported to the recurrent case, and that a recent regularization approach based on hypernetworks outperforms weight-importance methods, thus emerging as a promising candidate for CL in RNNs. Overall, we provide insights on the differences between CL in feedforward networks and RNNs, while guiding towards effective solutions to tackle CL on sequential data.", + "title": "Continual learning in recurrent neural networks", + "authors": [ + "Benjamin Ehret", + "Christian Henning", + "Maria Cervera", + "Alexander Meulemans", + "Johannes Von Oswald", + "Benjamin F Grewe" + ], + "emails": [ + "~Maria_Cervera1", + "ethz.ch", + "~Johannes_Von_Oswald1", + "~Alexander_Meulemans1", + "~Benjamin_F_Grewe1", + "~Christian_Henning1" + ], + "rank": 325 + }, + { + "url": "https://openreview.net/forum?id=sSjqmfsk95O", + "decision": "Accept (Spotlight)", + "ratings": [ + 6, + 8, + 4, + 8, + 7 + ], + "rating": "6.72", + "confidences": [ + 4, + 5, + 3, + 3, + 3 + ], + "abstract": "Numerous task-specific variants of conditional generative adversarial networks have been developed for image completion. Yet, a serious limitation remains that all existing algorithms tend to fail when handling large-scale missing regions. To overcome this challenge, we propose a generic new approach that bridges the gap between image-conditional and recent modulated unconditional generative architectures via co-modulation of both conditional and stochastic style representations. Also, due to the lack of good quantitative metrics for image completion, we propose the new Paired/Unpaired Inception Discriminative Score (P-IDS/U-IDS), which robustly measures the perceptual fidelity of inpainted images compared to real images via linear separability in a feature space. 
Experiments demonstrate superior performance in terms of both quality and diversity over state-of-the-art methods in free-form image completion and easy generalization to image-to-image translation. Code is available at https://github.com/zsyzzsoft/co-mod-gan.", + "title": "Large Scale Image Completion via Co-Modulated Generative Adversarial Networks", + "authors": [ + "Shengyu Zhao", + "Jonathan Cui", + "Yilun Sheng", + "Yue Dong", + "Xiao Liang", + "Eric I-Chao Chang", + "Yan Xu" + ], + "emails": [ + "qq.com", + "rdfz.cn", + "microsoft.com", + "~Jonathan_Cui1", + "gmail.com", + "~Yan_Xu2", + "~Shengyu_Zhao1" + ], + "rank": 326 + }, + { + "url": "https://openreview.net/forum?id=0oabwyZbOu", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 9, + 8, + 5 + ], + "rating": "6.72", + "confidences": [ + 4, + 5, + 5, + 4 + ], + "abstract": "Intelligent agents need to generalize from past experience to achieve goals in complex environments. World models facilitate such generalization and allow learning behaviors from imagined outcomes to increase sample-efficiency. While learning world models from image inputs has recently become feasible for some tasks, modeling Atari games accurately enough to derive successful behaviors has remained an open challenge for many years. We introduce DreamerV2, a reinforcement learning agent that learns behaviors purely from predictions in the compact latent space of a powerful world model. The world model uses discrete representations and is trained separately from the policy. DreamerV2 constitutes the first agent that achieves human-level performance on the Atari benchmark of 55 tasks by learning behaviors inside a separately trained world model. With the same computational budget and wall-clock time, Dreamer V2 reaches 200M frames and surpasses the final performance of the top single-GPU agents IQN and Rainbow. 
DreamerV2 is also applicable to tasks with continuous actions, where it learns an accurate world model of a complex humanoid robot and solves stand-up and walking from only pixel inputs.", + "title": "Mastering Atari with Discrete World Models", + "authors": [ + "Danijar Hafner", + "Timothy P Lillicrap", + "Mohammad Norouzi", + "Jimmy Ba" + ], + "emails": [ + "~Jimmy_Ba1", + "~Danijar_Hafner1", + "~Timothy_P_Lillicrap1", + "~Mohammad_Norouzi1" + ], + "rank": 327 + }, + { + "url": "https://openreview.net/forum?id=XSLF1XFq5h", + "decision": "Accept (Oral)", + "ratings": [ + 7, + 7, + 7, + 6 + ], + "rating": "6.71", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Both uncertainty estimation and interpretability are important factors for trustworthy machine learning systems. However, there is little work at the intersection of these two areas. We address this gap by proposing a novel method for interpreting uncertainty estimates from differentiable probabilistic models, like Bayesian Neural Networks (BNNs). Our method, Counterfactual Latent Uncertainty Explanations (CLUE), indicates how to change an input, while keeping it on the data manifold, such that a BNN becomes more confident about the input's prediction. We validate CLUE through 1) a novel framework for evaluating counterfactual explanations of uncertainty, 2) a series of ablation experiments, and 3) a user study. 
Our experiments show that CLUE outperforms baselines and enables practitioners to better understand which input patterns are responsible for predictive uncertainty.", + "title": "Getting a CLUE: A Method for Explaining Uncertainty Estimates", + "authors": [ + "Javier Antoran", + "Umang Bhatt", + "Tameem Adel", + "Adrian Weller", + "Jos\u00e9 Miguel Hern\u00e1ndez-Lobato" + ], + "emails": [ + "~Tameem_Adel1", + "~Adrian_Weller1", + "~Jos\u00e9_Miguel_Hern\u00e1ndez-Lobato1", + "~Javier_Antoran1", + "~Umang_Bhatt1" + ], + "rank": 328 + }, + { + "url": "https://openreview.net/forum?id=BM---bH_RSh", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 7 + ], + "rating": "6.71", + "confidences": [ + 5, + 5, + 3, + 4 + ], + "abstract": "The recommendation system (RS) plays an important role in the content recommendation and retrieval scenarios. The core part of the system is the Ranking neural network, which is usually a bottleneck of whole system performance during online inference. In this work, we propose a unified model and embedding compression (UMEC) framework to hammer an efficient neural network-based recommendation system. Our framework jointly learns input feature selection and neural network compression together, and solve them as an end-to-end resource-constrained optimization problem using ADMM. Our method outperforms other baselines in terms of neural network Flops, sparse embedding feature size and the number of sparse embedding features. We evaluate our method on the public benchmark of DLRM, trained over the Kaggle Criteo dataset. 
The codes can be found at https://github.com/VITA-Group/UMEC.", + "title": "UMEC: Unified model and embedding compression for efficient recommendation systems", + "authors": [ + "Jiayi Shen", + "Haotao Wang", + "Shupeng Gui", + "Jianchao Tan", + "Zhangyang Wang", + "Ji Liu" + ], + "emails": [ + "~Jianchao_Tan1", + "~Zhangyang_Wang1", + "~Jiayi_Shen1", + "~Shupeng_Gui1", + "~Haotao_Wang1", + "~Ji_Liu1" + ], + "rank": 329 + }, + { + "url": "https://openreview.net/forum?id=ZTFeSBIX9C", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6 + ], + "rating": "6.71", + "confidences": [ + 5, + 5, + 4 + ], + "abstract": "Knowledge distillation (KD) is essential for training non-autoregressive translation (NAT) models by reducing the complexity of the raw data with an autoregressive teacher model. In this study, we empirically show that as a side effect of this training, the lexical choice errors on low-frequency words are propagated to the NAT model from the teacher model. To alleviate this problem, we propose to expose the raw data to NAT models to restore the useful information of low-frequency words, which are missed in the distilled data. To this end, we introduce an extra Kullback-Leibler divergence term derived by comparing the lexical choice of NAT model and that embedded in the raw data. Experimental results across language pairs and model architectures demonstrate the effectiveness and universality of the proposed approach. Extensive analyses confirm our claim that our approach improves performance by reducing the lexical choice errors on low-frequency words. Encouragingly, our approach pushes the SOTA NAT performance on the WMT14 English-German and WMT16 Romanian-English datasets up to 27.8 and 33.8 BLEU points, respectively. ", + "title": "Understanding and Improving Lexical Choice in Non-Autoregressive Translation", + "authors": [ + "Liang Ding", + "Longyue Wang", + "Xuebo Liu", + "Derek F. 
Wong", + "Dacheng Tao", + "Zhaopeng Tu" + ], + "emails": [ + "~Liang_Ding3", + "~Longyue_Wang3", + "~Dacheng_Tao1", + "~Xuebo_Liu1", + "~Derek_F._Wong1", + "~Zhaopeng_Tu1" + ], + "rank": 330 + }, + { + "url": "https://openreview.net/forum?id=3uiR9bkbDjL", + "decision": "Reject", + "ratings": [ + 5, + 9, + 6, + 6 + ], + "rating": "6.71", + "confidences": [ + 3, + 5, + 5, + 4 + ], + "abstract": "Most real-world graphs are dynamic and eventually face the cold start problem. A fundamental question is how the new cold nodes acquire initial information in order to be adapted into the existing graph. Here we postulates the cold start problem as a fundamental issue in graph learning and propose a new learning setting, \"``Expanded Semi-supervised Learning.\" In expanded semi-supervised learning we extend the original semi-supervised learning setting even to new cold nodes that are disconnected from the graph. To this end, we propose ColdExpand model that classifies the cold nodes based on link prediction with multiple goals to tackle. We experimentally prove that by adding additional goal to existing link prediction method, our method outperforms the baseline in both expanded semi-supervised link prediction (at most 24\\%) and node classification tasks (at most 15%). 
To the best of our knowledge this is the first study to address expansion of semi-supervised learning to unseen nodes.", + "title": "ColdExpand: Semi-Supervised Graph Learning in Cold Start", + "authors": [ + "Il-Jae Kwon", + "Kyoung-Woon On", + "Dong-Geon Lee", + "Byoung-Tak Zhang" + ], + "emails": [ + "~Byoung-Tak_Zhang1", + "~Il-Jae_Kwon1", + "~Kyoung-Woon_On1", + "~Dong-Geon_Lee1" + ], + "rank": 331 + }, + { + "url": "https://openreview.net/forum?id=uCQfPZwRaUu", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 7, + 6 + ], + "rating": "6.71", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "While deep reinforcement learning excels at solving tasks where large amounts of data can be collected through virtually unlimited interaction with the environment, learning from limited interaction remains a key challenge. We posit that an agent can learn more efficiently if we augment reward maximization with self-supervised objectives based on structure in its visual input and sequential interaction with the environment. Our method, Self-Predictive Representations (SPR), trains an agent to predict its own latent state representations multiple steps into the future. We compute target representations for future states using an encoder which is an exponential moving average of the agent\u2019s parameters and we make predictions using a learned transition model. On its own, this future prediction objective outperforms prior methods for sample-efficient deep RL from pixels. We further improve performance by adding data augmentation to the future prediction loss, which forces the agent\u2019s representations to be consistent across multiple views of an observation. Our full self-supervised objective, which combines future prediction and data augmentation, achieves a median human-normalized score of 0.415 on Atari in a setting limited to 100k steps of environment interaction, which represents a 55% relative improvement over the previous state-of-the-art. 
Notably, even in this limited data regime, SPR exceeds expert human scores on 7 out of 26 games. We\u2019ve made the code associated with this work available at https://github.com/mila-iqia/spr.", + "title": "Data-Efficient Reinforcement Learning with Self-Predictive Representations", + "authors": [ + "Max Schwarzer", + "Ankesh Anand", + "Rishab Goel", + "R Devon Hjelm", + "Aaron Courville", + "Philip Bachman" + ], + "emails": [ + "~Philip_Bachman1", + "~Max_Schwarzer1", + "~Aaron_Courville3", + "~R_Devon_Hjelm1", + "~Rishab_Goel3", + "~Ankesh_Anand1" + ], + "rank": 332 + }, + { + "url": "https://openreview.net/forum?id=ZDnzZrTqU9N", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 7 + ], + "rating": "6.71", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "Distributionally robust optimization (DRO) provides a framework for training machine learning models that are able to perform well on a collection of related data distributions (the \"uncertainty set\"). This is done by solving a min-max game: the model is trained to minimize its maximum expected loss among all distributions in the uncertainty set. While careful design of the uncertainty set is critical to the success of the DRO procedure, previous work has been limited to relatively simple alternatives that keep the min-max optimization problem exactly tractable, such as f-divergence balls. In this paper, we argue instead for the use of neural generative models to characterize the worst-case distribution, allowing for more flexible and problem-specific selection of the uncertainty set. However, while simple conceptually, this approach poses a number of implementation and optimization challenges. To circumvent these issues, we propose a relaxation of the KL-constrained inner maximization objective that makes the DRO problem more amenable to gradient-based optimization of large scale generative models, and develop model selection heuristics to guide hyper-parameter search. 
On both toy settings and realistic NLP tasks, we find that the proposed approach yields models that are more robust than comparable baselines.", + "title": "Modeling the Second Player in Distributionally Robust Optimization", + "authors": [ + "Paul Michel", + "Tatsunori Hashimoto", + "Graham Neubig" + ], + "emails": [ + "~Tatsunori_Hashimoto1", + "~Paul_Michel1", + "~Graham_Neubig1" + ], + "rank": 333 + }, + { + "url": "https://openreview.net/forum?id=2AL06y9cDE-", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 7 + ], + "rating": "6.71", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "Despite their success in massive engineering applications, deep neural networks are vulnerable to various perturbations due to their black-box nature. Recent study has shown that a deep neural network can misclassify the data even if the input data is perturbed by an imperceptible amount. In this paper, we address the robustness issue of neural networks by a novel close-loop control method from the perspective of dynamic systems. Instead of modifying the parameters in a fixed neural network architecture, a close-loop control process is added to generate control signals adaptively for the perturbed or corrupted data. We connect the robustness of neural networks with optimal control using the geometrical information of underlying data to design the control objective. The detailed analysis shows how the embedding manifolds of state trajectory affect error estimation of the proposed method. Our approach can simultaneously maintain the performance on clean data and improve the robustness against many types of data perturbations. It can also further improve the performance of robustly trained neural networks against different perturbations. 
To the best of our knowledge, this is the first work that improves the robustness of neural networks with close-loop control.", + "title": "Towards Robust Neural Networks via Close-loop Control", + "authors": [ + "Zhuotong Chen", + "Qianxiao Li", + "Zheng Zhang" + ], + "emails": [ + "~Qianxiao_Li1", + "~Zhuotong_Chen1", + "~Zheng_Zhang2" + ], + "rank": 334 + }, + { + "url": "https://openreview.net/forum?id=QubpWYfdNry", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7, + 7 + ], + "rating": "6.71", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Human beings are able to understand objectives and learn by simply observing others perform a task. Imitation learning methods aim to replicate such capabilities, however, they generally depend on access to a full set of optimal states and actions taken with the agent's actuators and from the agent's point of view. In this paper, we introduce a new algorithm - called Disentangling Generative Adversarial Imitation Learning (DisentanGAIL) - with the purpose of bypassing such constraints. Our algorithm enables autonomous agents to learn directly from high dimensional observations of an expert performing a task, by making use of adversarial learning with a latent representation inside the discriminator network. Such latent representation is regularized through mutual information constraints to incentivize learning only features that encode information about the completion levels of the task being demonstrated. This allows to obtain a shared feature space to successfully perform imitation while disregarding the differences between the expert's and the agent's domains. 
Empirically, our algorithm is able to efficiently imitate in a diverse range of control problems including balancing, manipulation and locomotive tasks, while being robust to various domain differences in terms of both environment appearance and agent embodiment.", + "title": "Domain-Robust Visual Imitation Learning with Mutual Information Constraints", + "authors": [ + "Edoardo Cetin", + "Oya Celiktutan" + ], + "emails": [ + "~Edoardo_Cetin1", + "~Oya_Celiktutan2" + ], + "rank": 335 + }, + { + "url": "https://openreview.net/forum?id=ldxlzGYWDmW", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 8, + 5 + ], + "rating": "6.71", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "As a step towards improving the abstract reasoning capability of machines, we aim to solve Raven\u2019s Progressive Matrices (RPM) with neural networks, since solving RPM puzzles is highly correlated with human intelligence. Unlike previous methods that use auxiliary annotations or assume hidden rules to produce appropriate feature representation, we only use the ground truth answer of each question for model learning, aiming for an intelligent agent to have a strong learning capability with a small amount of supervision. Based on the RPM problem formulation, the correct answer filled into the missing entry of the third row/column has to best satisfy the same rules shared between the first two rows/columns.Thus we design a simple yet effective Dual-Contrast Network (DCNet) to exploit the inherent structure of RPM puzzles. Specifically, a rule contrast module is designed to compare the latent rules between the filled row/column and the first two rows/columns; a choice contrast module is designed to increase the relative differences between candidate choices. Experimental results on the RAVEN and PGM datasets show that DCNet outperforms the state-of-the-art methods by a large margin of 5.77%. 
Further experiments on few training samples and model generalization also show the effectiveness of DCNet. Code is available at https://github.com/visiontao/dcnet.", + "title": "Effective Abstract Reasoning with Dual-Contrast Network", + "authors": [ + "Tao Zhuo", + "Mohan Kankanhalli" + ], + "emails": [ + "~Mohan_Kankanhalli1", + "~Tao_Zhuo3" + ], + "rank": 336 + }, + { + "url": "https://openreview.net/forum?id=onxoVA9FxMw", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 8, + 6 + ], + "rating": "6.71", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "Various Position Embeddings (PEs) have been proposed in Transformer based architectures~(e.g. BERT) to model word order. These are empirically-driven and perform well, but no formal framework exists to systematically study them. To address this, we present three properties of PEs that capture word distance in vector space: translation invariance, monotonicity, and symmetry. These properties formally capture the behaviour of PEs and allow us to reinterpret sinusoidal PEs in a principled way.\nMoreover, we propose a new probing test (called `identical word probing') and mathematical indicators to quantitatively detect the general attention patterns with respect to the above properties. An empirical evaluation of seven PEs (and their combinations) for classification (GLUE) and span prediction (SQuAD) shows that: (1) both classification and span prediction benefit from translation invariance and local monotonicity, while symmetry slightly decreases performance;\n(2) The fully-learnable absolute PE performs better in classification, while relative PEs perform better in span prediction. 
We contribute the first formal and quantitative analysis of desiderata for PEs, and a principled discussion about their correlation to the performance of typical downstream tasks.", + "title": "On Position Embeddings in BERT", + "authors": [ + "Benyou Wang", + "Lifeng Shang", + "Christina Lioma", + "Xin Jiang", + "Hao Yang", + "Qun Liu", + "Jakob Grue Simonsen" + ], + "emails": [ + "~Qun_Liu1", + "~Benyou_Wang2", + "~Xin_Jiang1", + "~Christina_Lioma1", + "~Hao_Yang7", + "~Jakob_Grue_Simonsen1", + "~Lifeng_Shang1" + ], + "rank": 337 + }, + { + "url": "https://openreview.net/forum?id=TK_6nNb_C7q", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 7 + ], + "rating": "6.71", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Recent work by Marino et al. (2020) showed improved performance in sequential density estimation by combining masked autoregressive flows with hierarchical latent variable models. We draw a connection between such autoregressive generative models and the task of lossy video compression. Specifically, we view recent neural video compression methods (Lu et al., 2019; Yang et al., 2020b; Agustssonet al., 2020) as instances of a generalized stochastic temporal autoregressive transform, and propose avenues for enhancement based on this insight. 
Comprehensive evaluations on large-scale video data show improved rate-distortion performance over both state-of-the-art neural and conventional video compression methods.", + "title": "Hierarchical Autoregressive Modeling for Neural Video Compression", + "authors": [ + "Ruihan Yang", + "Yibo Yang", + "Joseph Marino", + "Stephan Mandt" + ], + "emails": [ + "~Ruihan_Yang1", + "~Yibo_Yang1", + "~Joseph_Marino1", + "~Stephan_Mandt1" + ], + "rank": 338 + }, + { + "url": "https://openreview.net/forum?id=9GsFOUyUPi", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6 + ], + "rating": "6.70", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "Recent studies have shown that skeletonization (pruning parameters) of networks at initialization provides all the practical benefits of sparsity both at inference and training time, while only marginally degrading their performance. However, we observe that beyond a certain level of sparsity (approx 95%), these approaches fail to preserve the network performance, and to our surprise, in many cases perform even worse than trivial random pruning. To this end, we propose an objective to find a skeletonized network with maximum foresight connection sensitivity (FORCE) whereby the trainability, in terms of connection sensitivity, of a pruned network is taken into consideration. We then propose two approximate procedures to maximize our objective (1) Iterative SNIP: allows parameters that were unimportant at earlier stages of skeletonization to become important at later stages; and (2) FORCE: iterative process that allows exploration by allowing already pruned parameters to resurrect at later stages of skeletonization. 
Empirical analysis on a large suite of experiments show that our approach, while providing at least as good performance as other recent approaches on moderate pruning levels, provide remarkably improved performance on high pruning levels (could remove up to 99.5% parameters while keeping the networks trainable).", + "title": "Progressive Skeletonization: Trimming more fat from a network at initialization", + "authors": [ + "Pau de Jorge", + "Amartya Sanyal", + "Harkirat Behl", + "Philip Torr", + "Gr\u00e9gory Rogez", + "Puneet K. Dokania" + ], + "emails": [ + "~Gr\u00e9gory_Rogez1", + "~Amartya_Sanyal1", + "~Harkirat_Behl1", + "~Pau_de_Jorge1", + "~Puneet_K._Dokania1", + "~Philip_Torr1" + ], + "rank": 339 + }, + { + "url": "https://openreview.net/forum?id=whE31dn74cL", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 7 + ], + "rating": "6.70", + "confidences": [ + 3, + 3, + 2, + 2 + ], + "abstract": "Sequential deep learning models such as RNN, causal CNN and attention mechanism do not readily consume continuous-time information. Discretizing the temporal data, as we show, causes inconsistency even for simple continuous-time processes. Current approaches often handle time in a heuristic manner to be consistent with the existing deep learning architectures and implementations. In this paper, we provide a principled way to characterize continuous-time systems using deep learning tools. Notably, the proposed approach applies to all the major deep learning architectures and requires little modifications to the implementation. The critical insight is to represent the continuous-time system by composing neural networks with a temporal kernel, where we gain our intuition from the recent advancements in understanding deep learning with Gaussian process and neural tangent kernel. To represent the temporal kernel, we introduce the random feature approach and convert the kernel learning problem to spectral density estimation under reparameterization. 
We further prove the convergence and consistency results even when the temporal kernel is non-stationary, and the spectral density is misspecified. The simulations and real-data experiments demonstrate the empirical effectiveness of our temporal kernel approach in a broad range of settings.", + "title": "A Temporal Kernel Approach for Deep Learning with Continuous-time Information", + "authors": [ + "Da Xu", + "Chuanwei Ruan", + "Evren Korpeoglu", + "Sushant Kumar", + "Kannan Achan" + ], + "emails": [ + "~Chuanwei_Ruan1", + "~Sushant_Kumar1", + "~Evren_Korpeoglu1", + "~Da_Xu2", + "~Kannan_Achan1" + ], + "rank": 340 + }, + { + "url": "https://openreview.net/forum?id=NX1He-aFO_F", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 8 + ], + "rating": "6.70", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "Policy gradient algorithms have proven to be successful in diverse decision making and control tasks. However, these methods suffer from high sample complexity and instability issues. In this paper, we address these challenges by providing a different approach for training the critic in the actor-critic framework. Our work builds on recent studies indicating that traditional actor-critic algorithms do not succeed in fitting the true value function, calling for the need to identify a better objective for the critic. In our method, the critic uses a new state-value (resp. state-action-value) function approximation that learns the value of the states (resp. state-action pairs) relative to their mean value rather than the absolute value as in conventional actor-critic. We prove the theoretical consistency of the new gradient estimator and observe dramatic empirical improvement across a variety of continuous control tasks and algorithms. 
Furthermore, we validate our method in tasks with sparse rewards, where we provide experimental evidence and theoretical insights.", + "title": "Learning Value Functions in Deep Policy Gradients using Residual Variance", + "authors": [ + "Yannis Flet-Berliac", + "reda ouhamma", + "odalric-ambrym maillard", + "Philippe Preux" + ], + "emails": [ + "~Philippe_Preux1", + "~reda_ouhamma1", + "~Yannis_Flet-Berliac1", + "~odalric-ambrym_maillard1" + ], + "rank": 341 + }, + { + "url": "https://openreview.net/forum?id=tL89RnzIiCd", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7 + ], + "rating": "6.70", + "confidences": [ + 3, + 3, + 4 + ], + "abstract": "We introduce a modern Hopfield network with continuous states and a corresponding update rule. The new Hopfield network can store exponentially (with the dimension of the associative space) many patterns, retrieves the pattern with one update, and has exponentially small retrieval errors. It has three types of energy minima (fixed points of the update): (1) global fixed point averaging over all patterns, (2) metastable states averaging over a subset of patterns, and (3) fixed points which store a single pattern. The new update rule is equivalent to the attention mechanism used in transformers. This equivalence enables a characterization of the heads of transformer models. These heads perform in the first layers preferably global averaging and in higher layers partial averaging via metastable states. The new modern Hopfield network can be integrated into deep learning architectures as layers to allow the storage of and access to raw input data, intermediate results, or learned prototypes.\nThese Hopfield layers enable new ways of deep learning, beyond fully-connected, convolutional, or recurrent networks, and provide pooling, memory, association, and attention mechanisms. We demonstrate the broad applicability of the Hopfield layers\nacross various domains. 
Hopfield layers improved state-of-the-art on three out of four considered multiple instance learning problems as well as on immune repertoire classification with several hundreds of thousands of instances. On the UCI benchmark collections of small classification tasks, where deep learning methods typically struggle, Hopfield layers yielded a new state-of-the-art when compared to different machine learning methods. Finally, Hopfield layers achieved state-of-the-art on two drug design datasets. The implementation is available at: \\url{https://github.com/ml-jku/hopfield-layers}", + "title": "Hopfield Networks is All You Need", + "authors": [ + "Hubert Ramsauer", + "Bernhard Sch\u00e4fl", + "Johannes Lehner", + "Philipp Seidl", + "Michael Widrich", + "Lukas Gruber", + "Markus Holzleitner", + "Thomas Adler", + "David Kreil", + "Michael K Kopp", + "G\u00fcnter Klambauer", + "Johannes Brandstetter", + "Sepp Hochreiter" + ], + "emails": [ + "~Johannes_Lehner1", + "~Philipp_Seidl1", + "~Markus_Holzleitner1", + "~Lukas_Gruber2", + "kreil.org", + "~Bernhard_Sch\u00e4fl1", + "~Hubert_Ramsauer2", + "~Johannes_Brandstetter1", + "~G\u00fcnter_Klambauer1", + "~Michael_K_Kopp1", + "~Thomas_Adler1", + "~Michael_Widrich2", + "~Sepp_Hochreiter1" + ], + "rank": 342 + }, + { + "url": "https://openreview.net/forum?id=LXMSvPmsm0g", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 8 + ], + "rating": "6.70", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "The lottery ticket hypothesis states that a highly sparsified sub-network can be trained in isolation, given the appropriate weight initialization. This paper extends that hypothesis from one-shot task learning, and demonstrates for the first time that such extremely compact and independently trainable sub-networks can be also identified in the lifelong learning scenario, which we call lifelong tickets. 
We show that the resulting lifelong ticket can further be leveraged to improve the performance of learning over continual tasks. However, it is highly non-trivial to conduct network pruning in the lifelong setting. Two critical roadblocks arise: i) As many tasks now arrive sequentially, finding tickets in a greedy weight pruning fashion will inevitably suffer from the intrinsic bias, that the earlier emerging tasks impact more; ii) As lifelong learning is consistently challenged by catastrophic forgetting, the compact network capacity of tickets might amplify the risk of forgetting. In view of those, we introduce two pruning options, e.g., top-down and bottom-up, for finding lifelong tickets. Compared to the top-down pruning that extends vanilla (iterative) pruning over sequential tasks, we show that the bottom-up one, which can dynamically shrink and (re-)expand model capacity, effectively avoids the undesirable excessive pruning in the early stage. We additionally introduce lottery teaching that further overcomes forgetting via knowledge distillation aided by external unlabeled data. Unifying those ingredients, we demonstrate the existence of very competitive lifelong tickets, e.g., achieving 3-8% of the dense model size with even higher accuracy, compared to strong class-incremental learning baselines on CIFAR-10/CIFAR-100/Tiny-ImageNet datasets. 
Codes available at https://github.com/VITA-Group/Lifelong-Learning-LTH.", + "title": "Long Live the Lottery: The Existence of Winning Tickets in Lifelong Learning", + "authors": [ + "Tianlong Chen", + "Zhenyu Zhang", + "Sijia Liu", + "Shiyu Chang", + "Zhangyang Wang" + ], + "emails": [ + "~Shiyu_Chang2", + "~Zhangyang_Wang1", + "~Tianlong_Chen1", + "~Sijia_Liu1", + "~Zhenyu_Zhang4" + ], + "rank": 343 + }, + { + "url": "https://openreview.net/forum?id=v_1Soh8QUNc", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6 + ], + "rating": "6.70", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "While energy-based models (EBMs) exhibit a number of desirable properties, training and sampling on high-dimensional datasets remains challenging. Inspired by recent progress on diffusion probabilistic models, we present a diffusion recovery likelihood method to tractably learn and sample from a sequence of EBMs trained on increasingly noisy versions of a dataset. Each EBM is trained with recovery likelihood, which maximizes the conditional probability of the data at a certain noise level given their noisy versions at a higher noise level. Optimizing recovery likelihood is more tractable than marginal likelihood, as sampling from the conditional distributions is much easier than sampling from the marginal distributions. After training, synthesized images can be generated by the sampling process that initializes from Gaussian white noise distribution and progressively samples the conditional distributions at decreasingly lower noise levels. Our method generates high fidelity samples on various image datasets. On unconditional CIFAR-10 our method achieves FID 9.58 and inception score 8.30, superior to the majority of GANs. 
Moreover, we demonstrate that unlike previous work on EBMs, our long-run MCMC samples from the conditional distributions do not diverge and still represent realistic images, allowing us to accurately estimate the normalized density of data even for high-dimensional datasets. Our implementation is available at \\url{https://github.com/ruiqigao/recovery_likelihood}.\n", + "title": "Learning Energy-Based Models by Diffusion Recovery Likelihood", + "authors": [ + "Ruiqi Gao", + "Yang Song", + "Ben Poole", + "Ying Nian Wu", + "Diederik P Kingma" + ], + "emails": [ + "~Yang_Song1", + "~Ruiqi_Gao2", + "~Ying_Nian_Wu1", + "~Diederik_P_Kingma1", + "~Ben_Poole1" + ], + "rank": 344 + }, + { + "url": "https://openreview.net/forum?id=Y87Ri-GNHYu", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 7, + 8 + ], + "rating": "6.69", + "confidences": [ + 5, + 4, + 4, + 3 + ], + "abstract": "Complex, multi-task problems have proven to be difficult to solve efficiently in a sparse-reward reinforcement learning setting. In order to be sample efficient, multi-task learning requires reuse and sharing of low-level policies. To facilitate the automatic decomposition of hierarchical tasks, we propose the use of step-by-step human demonstrations in the form of natural language instructions and action trajectories. We introduce a dataset of such demonstrations in a crafting-based grid world. Our model consists of a high-level language generator and low-level policy, conditioned on language. We find that human demonstrations help solve the most complex tasks. We also find that incorporating natural language allows the model to generalize to unseen tasks in a zero-shot setting and to learn quickly from a few demonstrations. Generalization is not only reflected in the actions of the agent, but also in the generated natural language instructions in unseen tasks. 
Our approach also gives our trained agent interpretable behaviors because it is able to generate a sequence of high-level descriptions of its actions.", + "title": "Ask Your Humans: Using Human Instructions to Improve Generalization in Reinforcement Learning", + "authors": [ + "Valerie Chen", + "Abhinav Gupta", + "Kenneth Marino" + ], + "emails": [ + "~Abhinav_Gupta1", + "~Kenneth_Marino1", + "~Valerie_Chen2" + ], + "rank": 345 + }, + { + "url": "https://openreview.net/forum?id=UvBPbpvHRj-", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 8, + 7 + ], + "rating": "6.69", + "confidences": [ + 4, + 5, + 4, + 3 + ], + "abstract": "Current approaches for uncertainty estimation in deep learning often produce too confident results. Bayesian Neural Networks (BNNs) model uncertainty in the space of weights, which is usually high-dimensional and limits the quality of variational approximations. The more recent functional BNNs (fBNNs) address this only partially because, although the prior is specified in the space of functions, the posterior approximation is still defined in terms of stochastic weights. In this work we propose to move uncertainty from the weights (which are deterministic) to the activation function. Specifically, the activations are modelled with simple 1D Gaussian Processes (GP), for which a triangular kernel inspired by the ReLu non-linearity is explored. Our experiments show that activation-level stochasticity provides more reliable uncertainty estimates than BNN and fBNN, whereas it performs competitively in standard prediction tasks. We also study the connection with deep GPs, both theoretically and empirically. 
More precisely, we show that activation-level uncertainty requires fewer inducing points and is better suited for deep architectures.", + "title": "Activation-level uncertainty in deep neural networks", + "authors": [ + "Pablo Morales-Alvarez", + "Daniel Hern\u00e1ndez-Lobato", + "Rafael Molina", + "Jos\u00e9 Miguel Hern\u00e1ndez-Lobato" + ], + "emails": [ + "~Daniel_Hern\u00e1ndez-Lobato1", + "decsai.ugr.es", + "~Pablo_Morales-Alvarez1", + "~Jos\u00e9_Miguel_Hern\u00e1ndez-Lobato1" + ], + "rank": 346 + }, + { + "url": "https://openreview.net/forum?id=AHOs7Sm5H7R", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 8, + 6 + ], + "rating": "6.69", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Matrix factorization is a simple and natural test-bed to investigate the implicit regularization of gradient descent. Gunasekar et al. (2017) conjectured that gradient flow with infinitesimal initialization converges to the solution that minimizes the nuclear norm, but a series of recent papers argued that the language of norm minimization is not sufficient to give a full characterization for the implicit regularization. In this work, we provide theoretical and empirical evidence that for depth-2 matrix factorization, gradient flow with infinitesimal initialization is mathematically equivalent to a simple heuristic rank minimization algorithm, Greedy Low-Rank Learning, under some reasonable assumptions. This generalizes the rank minimization view from previous works to a much broader setting and enables us to construct counter-examples to refute the conjecture from Gunasekar et al. (2017). 
We also extend the results to the case where depth >= 3, and we show that the benefit of being deeper is that the above convergence has a much weaker dependence over initialization magnitude so that this rank minimization is more likely to take effect for initialization with practical scale.", + "title": "Towards Resolving the Implicit Bias of Gradient Descent for Matrix Factorization: Greedy Low-Rank Learning", + "authors": [ + "Zhiyuan Li", + "Yuping Luo", + "Kaifeng Lyu" + ], + "emails": [ + "~Yuping_Luo1", + "~Zhiyuan_Li2", + "~Kaifeng_Lyu2" + ], + "rank": 347 + }, + { + "url": "https://openreview.net/forum?id=-_Zp7r2-cGK", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 5, + 8 + ], + "rating": "6.69", + "confidences": [ + 4, + 4, + 3, + 5 + ], + "abstract": "In probabilistic classification, a discriminative model based on the softmax function has a potential limitation in that it assumes unimodality for each class in the feature space. The mixture model can address this issue, although it leads to an increase in the number of parameters. We propose a sparse classifier based on a discriminative GMM, referred to as a sparse discriminative Gaussian mixture (SDGM). In the SDGM, a GMM-based discriminative model is trained via sparse Bayesian learning. Using this sparse learning framework, we can simultaneously remove redundant Gaussian components and reduce the number of parameters used in the remaining components during learning; this learning method reduces the model complexity, thereby improving the generalization capability. Furthermore, the SDGM can be embedded into neural networks (NNs), such as convolutional NNs, and can be trained in an end-to-end manner. 
Experimental results demonstrated that the proposed method outperformed the existing softmax-based discriminative models.", + "title": "A Discriminative Gaussian Mixture Model with Sparsity", + "authors": [ + "Hideaki Hayashi", + "Seiichi Uchida" + ], + "emails": [ + "~Hideaki_Hayashi1", + "~Seiichi_Uchida1" + ], + "rank": 348 + }, + { + "url": "https://openreview.net/forum?id=sgnp-qFYtN", + "decision": "Reject", + "ratings": [ + 5, + 5, + 9, + 7 + ], + "rating": "6.69", + "confidences": [ + 3, + 2, + 3, + 5 + ], + "abstract": "Sparsifying deep neural networks is of paramount interest in many areas, especially when those networks have to be implemented on low-memory devices. In this article, we propose a new formulation of the problem of generating sparse weights for a neural network. By leveraging the properties of standard nonlinear activation functions, we show that the problem is equivalent to an approximate subdifferential inclusion problem. The accuracy of the approximation controls the sparsity. We show that the proposed approach is valid for a broad class of activation functions (ReLU, sigmoid, softmax). We propose an iterative optimization algorithm to induce sparsity whose convergence is guaranteed. Because of the algorithm flexibility, the sparsity can be ensured from partial training data in a minibatch manner. 
To demonstrate the effectiveness of our method, we perform experiments on various networks in different applicative contexts: image classification, speech recognition, natural language processing, and time-series forecasting.", + "title": "Sparsifying Networks via Subdifferential Inclusion", + "authors": [ + "Sagar Verma", + "Jean-Christophe Pesquet" + ], + "emails": [ + "~Jean-Christophe_Pesquet1", + "~Sagar_Verma1" + ], + "rank": 349 + }, + { + "url": "https://openreview.net/forum?id=unI5ucw_Jk", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6 + ], + "rating": "6.67", + "confidences": [ + 3, + 3, + 3 + ], + "abstract": "Understanding human behavior from observed data is critical for transparency and accountability in decision-making. Consider real-world settings such as healthcare, in which modeling a decision-maker\u2019s policy is challenging\u2014with no access to underlying states, no knowledge of environment dynamics, and no allowance for live experimentation. We desire learning a data-driven representation of decision- making behavior that (1) inheres transparency by design, (2) accommodates partial observability, and (3) operates completely offline. To satisfy these key criteria, we propose a novel model-based Bayesian method for interpretable policy learning (\u201cInterpole\u201d) that jointly estimates an agent\u2019s (possibly biased) belief-update process together with their (possibly suboptimal) belief-action mapping. 
Through experiments on both simulated and real-world data for the problem of Alzheimer\u2019s disease diagnosis, we illustrate the potential of our approach as an investigative device for auditing, quantifying, and understanding human decision-making behavior.", + "title": "Explaining by Imitating: Understanding Decisions by Interpretable Policy Learning", + "authors": [ + "Alihan H\u00fcy\u00fck", + "Daniel Jarrett", + "Cem Tekin", + "Mihaela van der Schaar" + ], + "emails": [ + "~Alihan_H\u00fcy\u00fck1", + "~Mihaela_van_der_Schaar2", + "~Cem_Tekin2", + "~Daniel_Jarrett1" + ], + "rank": 350 + }, + { + "url": "https://openreview.net/forum?id=sCZbhBvqQaU", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 6 + ], + "rating": "6.67", + "confidences": [ + 2, + 3, + 5, + 5 + ], + "abstract": "We study the robustness of reinforcement learning (RL) with adversarially perturbed state observations, which aligns with the setting of many adversarial attacks to deep reinforcement learning (DRL) and is also important for rolling out real-world RL agent under unpredictable sensing noise. With a fixed agent policy, we demonstrate that an optimal adversary to perturb state observations can be found, which is guaranteed to obtain the worst case agent reward. For DRL settings, this leads to a novel empirical adversarial attack to RL agents via a learned adversary that is much stronger than previous ones. To enhance the robustness of an agent, we propose a framework of alternating training with learned adversaries (ATLA), which trains an adversary online together with the agent using policy gradient following the optimal adversarial attack framework. Additionally, inspired by the analysis of state-adversarial Markov decision process (SA-MDP), we show that past states and actions (history) can be useful for learning a robust agent, and we empirically find a LSTM based policy can be more robust under adversaries. 
Empirical evaluations on a few continuous control environments show that ATLA achieves state-of-the-art performance under strong adversaries. Our code is available at https://github.com/huanzhang12/ATLA_robust_RL.", + "title": "Robust Reinforcement Learning on State Observations with Learned Optimal Adversary", + "authors": [ + "Huan Zhang", + "Hongge Chen", + "Duane S Boning", + "Cho-Jui Hsieh" + ], + "emails": [ + "~Hongge_Chen1", + "~Cho-Jui_Hsieh1", + "~Duane_S_Boning1", + "~Huan_Zhang1" + ], + "rank": 351 + }, + { + "url": "https://openreview.net/forum?id=A2gNouoXE7", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 8, + 6 + ], + "rating": "6.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Due to widespread interest in machine translation and transfer learning, there are numerous algorithms for mapping multiple embeddings to a shared representation space. Recently, these algorithms have been studied in the setting of bilingual lexicon induction where one seeks to align the embeddings of a source and a target language such that translated word pairs lie close to one another in a common representation space. In this paper, we propose a method, Filtered Inner Product Projection (FIPP), for mapping embeddings to a common representation space. As semantic shifts are pervasive across languages and domains, FIPP first identifies the common geometric structure in both embeddings and then, only on the common structure, aligns the Gram matrices of these embeddings. FIPP is applicable even when the source and target embeddings are of differing dimensionalities. Additionally, FIPP provides computational benefits in ease of implementation and is faster to compute than current approaches. Following the baselines in Glavas et al. 2019, we evaluate FIPP both in the context of bilingual lexicon induction and downstream language tasks. 
We show that FIPP outperforms existing methods on the XLING BLI dataset for most language pairs while also providing robust performance across downstream tasks. ", + "title": "Filtered Inner Product Projection for Crosslingual Embedding Alignment", + "authors": [ + "Vin Sachidananda", + "Ziyi Yang", + "Chenguang Zhu" + ], + "emails": [ + "~Chenguang_Zhu1", + "~Vin_Sachidananda1", + "~Ziyi_Yang1" + ], + "rank": 352 + }, + { + "url": "https://openreview.net/forum?id=Sc8cY4Jpi3s", + "decision": "Reject", + "ratings": [ + 6, + 7, + 7 + ], + "rating": "6.67", + "confidences": [ + 3, + 4, + 2 + ], + "abstract": "Optimization in machine learning, both theoretical and applied, is presently dominated by first-order gradient methods such as stochastic gradient descent. Second-order optimization methods, that involve second derivatives and/or second order statistics of the data, are far less prevalent despite strong theoretical properties, due to their prohibitive computation, memory and communication costs. In an attempt to bridge this gap between theoretical and practical optimization, we present a scalable implementation of a second-order preconditioned method (concretely, a variant of full-matrix Adagrad), that along with several critical algorithmic and numerical improvements, provides significant convergence and wall-clock time improvements compared to conventional first-order methods on state-of-the-art deep models. Our novel design effectively utilizes the prevalent heterogeneous hardware architecture for training deep models, consisting of a multicore CPU coupled with multiple accelerator units. 
We demonstrate superior performance compared to state-of-the-art on very large learning tasks such as machine translation with Transformers, language modeling with BERT, click-through rate prediction on Criteo, and image classification on ImageNet with ResNet-50.", + "title": "Towards Practical Second Order Optimization for Deep Learning", + "authors": [ + "Rohan Anil", + "Vineet Gupta", + "Tomer Koren", + "Kevin Regan", + "Yoram Singer" + ], + "emails": [ + "~Rohan_Anil1", + "~Tomer_Koren1", + "google.com", + "~Yoram_Singer3", + "~Vineet_Gupta1" + ], + "rank": 353 + }, + { + "url": "https://openreview.net/forum?id=KLH36ELmwIB", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 8, + 6 + ], + "rating": "6.67", + "confidences": [ + 2, + 3, + 5, + 5 + ], + "abstract": "Despite the fast development of differentiable architecture search (DARTS), it suffers from a standing instability issue regarding searching performance, which extremely limits its application. Existing robustifying methods draw clues from the outcome instead of finding out the causing factor. Various indicators such as Hessian eigenvalues are proposed as a signal of performance collapse, and the searching should be stopped once an indicator reaches a preset threshold.\nHowever, these methods tend to easily reject good architectures if thresholds are inappropriately set, let alone the searching is intrinsically noisy. In this paper, we undertake a more subtle and direct approach to resolve the collapse. \nWe first demonstrate that skip connections with a learnable architectural coefficient can easily recover from a disadvantageous state and become dominant. We conjecture that skip connections profit too much from this privilege, hence causing the collapse for the derived model. Therefore, we propose to factor out this benefit with an auxiliary skip connection, ensuring a fairer competition for all operations. 
Extensive experiments on various datasets verify that our approach can substantially improve the robustness of DARTS. Our code is available at https://github.com/Meituan-AutoML/DARTS-", + "title": "DARTS-: Robustly Stepping out of Performance Collapse Without Indicators", + "authors": [ + "Xiangxiang Chu", + "Xiaoxing Wang", + "Bo Zhang", + "Shun Lu", + "Xiaolin Wei", + "Junchi Yan" + ], + "emails": [ + "sjtu.edu.cn", + "~Xiangxiang_Chu1", + "~Bo_Zhang7", + "~Junchi_Yan2", + "~Shun_Lu1", + "meituan.com" + ], + "rank": 354 + }, + { + "url": "https://openreview.net/forum?id=opHLcXxYTC_", + "decision": "Accept (Spotlight)", + "ratings": [ + 6, + 7, + 7 + ], + "rating": "6.67", + "confidences": [ + 3, + 3, + 3 + ], + "abstract": "Identifying harmful instances, whose absence in a training dataset improves model performance, is important for building better machine learning models. \nAlthough previous studies have succeeded in estimating harmful instances under supervised settings, they cannot be trivially extended to generative adversarial networks (GANs).\nThis is because previous approaches require that (i) the absence of a training instance directly affects the loss value and that (ii) the change in the loss directly measures the harmfulness of the instance for the performance of a model. \nIn GAN training, however, neither of the requirements is satisfied. 
\nThis is because, (i) the generator\u2019s loss is not directly affected by the training instances as they are not part of the generator's training steps, and (ii) the values of GAN's losses normally do not capture the generative performance of a model.\nTo this end, (i) we propose an influence estimation method that uses the Jacobian of the gradient of the generator's loss with respect to the discriminator\u2019s parameters (and vice versa) to trace how the absence of an instance in the discriminator\u2019s training affects the generator\u2019s parameters, and (ii) we propose a novel evaluation scheme, in which we assess harmfulness of each training instance on the basis of how GAN evaluation metric (e.g., inception score) is expected to change due to the removal of the instance.\nWe experimentally verified that our influence estimation method correctly inferred the changes in GAN evaluation metrics.\nWe also demonstrated that the removal of the identified harmful instances effectively improved the model\u2019s generative performance with respect to various GAN evaluation metrics.", + "title": "Influence Estimation for Generative Adversarial Networks", + "authors": [ + "Naoyuki Terashita", + "Hiroki Ohashi", + "Yuichi Nonaka", + "Takashi Kanemaru" + ], + "emails": [ + "~Hiroki_Ohashi1", + "~Naoyuki_Terashita1", + "hitachi.com" + ], + "rank": 355 + }, + { + "url": "https://openreview.net/forum?id=hJmtwocEqzc", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7 + ], + "rating": "6.67", + "confidences": [ + 5, + 4, + 3 + ], + "abstract": "Facial recognition systems are increasingly deployed by private corporations, government agencies, and contractors for consumer services and mass surveillance programs alike. These systems are typically built by scraping social media profiles for user images. Adversarial perturbations have been proposed for bypassing facial recognition systems. However, existing methods fail on full-scale systems and commercial APIs. 
We develop our own adversarial filter that accounts for the entire image processing pipeline and is demonstrably effective against industrial-grade pipelines that include face detection and large scale databases. Additionally, we release an easy-to-use webtool that significantly degrades the accuracy of Amazon Rekognition and the Microsoft Azure Face Recognition API, reducing the accuracy of each to below 1%.", + "title": "LowKey: Leveraging Adversarial Attacks to Protect Social Media Users from Facial Recognition", + "authors": [ + "Valeriia Cherepanova", + "Micah Goldblum", + "Harrison Foley", + "Shiyuan Duan", + "John P Dickerson", + "Gavin Taylor", + "Tom Goldstein" + ], + "emails": [ + "~John_P_Dickerson1", + "umd.edu", + "usna.edu", + "~Valeriia_Cherepanova1", + "~Micah_Goldblum1", + "~Tom_Goldstein1", + "~Gavin_Taylor1" + ], + "rank": 356 + }, + { + "url": "https://openreview.net/forum?id=6xHJ37MVxxp", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7 + ], + "rating": "6.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Though convolutional neural networks (CNNs) have demonstrated remarkable ability in learning discriminative features, they often generalize poorly to unseen domains. Domain generalization aims to address this problem by learning from a set of source domains a model that is generalizable to any unseen domain. In this paper, a novel approach is proposed based on probabilistically mixing instance-level feature statistics of training samples across source domains. Our method, termed MixStyle, is motivated by the observation that visual domain is closely related to image style (e.g., photo vs.~sketch images). Such style information is captured by the bottom layers of a CNN where our proposed style-mixing takes place. Mixing styles of training instances results in novel domains being synthesized implicitly, which increase the domain diversity of the source domains, and hence the generalizability of the trained model. 
MixStyle fits into mini-batch training perfectly and is extremely easy to implement. The effectiveness of MixStyle is demonstrated on a wide range of tasks including category classification, instance retrieval and reinforcement learning.", + "title": "Domain Generalization with MixStyle", + "authors": [ + "Kaiyang Zhou", + "Yongxin Yang", + "Yu Qiao", + "Tao Xiang" + ], + "emails": [ + "~Tao_Xiang1", + "~Kaiyang_Zhou1", + "~Yongxin_Yang1", + "~Yu_Qiao1" + ], + "rank": 357 + }, + { + "url": "https://openreview.net/forum?id=QtTKTdVrFBB", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 4, + 8, + 8 + ], + "rating": "6.67", + "confidences": [ + 3, + 5, + 3, + 4 + ], + "abstract": "Transformers are state-of-the-art models for a variety of sequence modeling tasks. At their core is an attention function which models pairwise interactions between the inputs at every timestep. While attention is powerful, it does not scale efficiently to long sequences due to its quadratic time and space complexity in the sequence length. We propose RFA, a linear time and space attention that uses random feature methods to approximate the softmax function, and explore its application in transformers. RFA can be used as a drop-in replacement for conventional softmax attention and offers a straightforward way of learning with recency bias through an optional gating mechanism. Experiments on language modeling and machine translation demonstrate that RFA achieves similar or better performance compared to strong transformer baselines. In the machine translation experiment, RFA decodes twice as fast as a vanilla transformer. Compared to existing efficient transformer variants, RFA is competitive in terms of both accuracy and efficiency on three long text classification datasets. 
Our analysis shows that RFA\u2019s efficiency gains are especially notable on long sequences, suggesting that RFA will be particularly useful in tasks that require working with large inputs, fast decoding speed, or low memory footprints.", + "title": "Random Feature Attention", + "authors": [ + "Hao Peng", + "Nikolaos Pappas", + "Dani Yogatama", + "Roy Schwartz", + "Noah Smith", + "Lingpeng Kong" + ], + "emails": [ + "~Lingpeng_Kong1", + "~Dani_Yogatama2", + "~Nikolaos_Pappas1", + "~Hao_Peng4", + "~Roy_Schwartz1", + "~Noah_Smith1" + ], + "rank": 358 + }, + { + "url": "https://openreview.net/forum?id=tkAtoZkcUnm", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 7 + ], + "rating": "6.67", + "confidences": [ + 5, + 3, + 4, + 3 + ], + "abstract": "Thompson Sampling (TS) is one of the most effective algorithms for solving contextual multi-armed bandit problems. In this paper, we propose a new algorithm, called Neural Thompson Sampling, which adapts deep neural networks for both exploration and exploitation. At the core of our algorithm is a novel posterior distribution of the reward, where its mean is the neural network approximator, and its variance is built upon the neural tangent features of the corresponding neural network. We prove that, provided the underlying reward function is bounded, the proposed algorithm is guaranteed to achieve a cumulative regret of O(T1/2), which matches the regret of other contextual bandit algorithms in terms of total round number T. 
Experimental comparisons with other benchmark bandit algorithms on various data sets corroborate our theory.", + "title": "Neural Thompson Sampling", + "authors": [ + "Weitong ZHANG", + "Dongruo Zhou", + "Lihong Li", + "Quanquan Gu" + ], + "emails": [ + "~Weitong_ZHANG1", + "~Quanquan_Gu1", + "~Dongruo_Zhou1", + "~Lihong_Li1" + ], + "rank": 359 + }, + { + "url": "https://openreview.net/forum?id=6zaTwpNSsQ2", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7 + ], + "rating": "6.67", + "confidences": [ + 4, + 5, + 3 + ], + "abstract": "Training Deep Neural Networks (DNN) with high efficiency can be difficult to achieve with native floating-point representations and commercially available hardware. Specialized arithmetic with custom acceleration offers perhaps the most promising alternative. Ongoing research is trending towards narrow floating-point representations, called minifloats, that pack more operations for a given silicon area and consume less power. In this paper, we introduce Block Minifloat (BM), a new spectrum of minifloat formats capable of training DNNs end-to-end with only 4-8 bit weight, activation and gradient tensors. While standard floating-point representations have two degrees of freedom, via the exponent and mantissa, BM exposes the exponent bias as an additional field for optimization. Crucially, this enables training with fewer exponent bits, yielding dense integer-like hardware for fused multiply-add (FMA) operations. For ResNet trained on ImageNet, 6-bit BM achieves almost no degradation in floating-point accuracy with FMA units that are 4.1\u00d7(23.9\u00d7) smaller and consume 2.3\u00d7(16.1\u00d7) less energy than FP8 (FP32). 
Furthermore, our 8-bit BM format matches floating-point accuracy while delivering a higher computational density and faster expected training times.", + "title": "A Block Minifloat Representation for Training Deep Neural Networks", + "authors": [ + "Sean Fox", + "Seyedramin Rasoulinezhad", + "Julian Faraone", + "david boland", + "Philip Leong" + ], + "emails": [ + "~Julian_Faraone1", + "~Sean_Fox2", + "~Philip_Leong1", + "sydney.edu.au" + ], + "rank": 360 + }, + { + "url": "https://openreview.net/forum?id=H0syOoy3Ash", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7 + ], + "rating": "6.67", + "confidences": [ + 3, + 4, + 2 + ], + "abstract": "Advances in generative modeling and adversarial learning have given rise to renewed interest in smooth games. However, the absence of symmetry in the matrix of second derivatives poses challenges that are not present in the classical minimization framework. While a rich theory of average-case analysis has been developed for minimization problems, little is known in the context of smooth games. In this work we take a first step towards closing this gap by developing average-case optimal first-order methods for a subset of smooth games. \nWe make the following three main contributions. First, we show that for zero-sum bilinear games the average-case optimal method is the optimal method for the minimization of the Hamiltonian. Second, we provide an explicit expression for the optimal method corresponding to normal matrices, potentially non-symmetric. Finally, we specialize it to matrices with eigenvalues located in a disk and show a provable speed-up compared to worst-case optimal algorithms. 
We illustrate our findings through benchmarks with a varying degree of mismatch with our assumptions.", + "title": "Average-case Acceleration for Bilinear Games and Normal Matrices", + "authors": [ + "Carles Domingo-Enrich", + "Fabian Pedregosa", + "Damien Scieur" + ], + "emails": [ + "~Damien_Scieur3", + "~Carles_Domingo-Enrich1", + "~Fabian_Pedregosa1" + ], + "rank": 361 + }, + { + "url": "https://openreview.net/forum?id=de11dbHzAMF", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 6, + 8 + ], + "rating": "6.67", + "confidences": [ + 4, + 4, + 3, + 3, + 4 + ], + "abstract": "Multi-Task Learning (MTL) networks have emerged as a promising method for transferring learned knowledge across different tasks. However, MTL must deal with challenges such as: overfitting to low resource tasks, catastrophic forgetting, and negative task transfer, or learning interference. Often, in Natural Language Processing (NLP), a separate model per task is needed to obtain the best performance. However, many fine-tuning approaches are both parameter inefficient, i.e., potentially involving one new model per task, and highly susceptible to losing knowledge acquired during pretraining. We propose a novel Transformer based Adapter consisting of a new conditional attention mechanism as well as a set of task-conditioned modules that facilitate weight sharing. Through this construction, we achieve more efficient parameter sharing and mitigate forgetting by keeping half of the weights of a pretrained model fixed. We also use a new multi-task data sampling strategy to mitigate the negative effects of data imbalance across tasks. Using this approach, we are able to surpass single task fine-tuning methods while being parameter and data efficient (using around 66% of the data). Compared to other BERT Large methods on GLUE, our 8-task model surpasses other Adapter methods by 2.8% and our 24-task model outperforms by 0.7-1.0% models that use MTL and single task fine-tuning. 
We show that a larger variant of our single multi-task model approach performs competitively across 26 NLP tasks and yields state-of-the-art results on a number of test and development sets.", + "title": "Conditionally Adaptive Multi-Task Learning: Improving Transfer Learning in NLP Using Fewer Parameters & Less Data", + "authors": [ + "Jonathan Pilault", + "Amine El hattami", + "Christopher Pal" + ], + "emails": [ + "~Jonathan_Pilault1", + "~Amine_El_hattami1", + "~Christopher_Pal1" + ], + "rank": 362 + }, + { + "url": "https://openreview.net/forum?id=jEYKjPE1xYN", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 6, + 6 + ], + "rating": "6.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Automating molecular design using deep reinforcement learning (RL) has the potential to greatly accelerate the search for novel materials. Despite recent progress on leveraging graph representations to design molecules, such methods are fundamentally limited by the lack of three-dimensional (3D) information. In light of this, we propose a novel actor-critic architecture for 3D molecular design that can generate molecular structures unattainable with previous approaches. This is achieved by exploiting the symmetries of the design process through a rotationally covariant state-action representation based on a spherical harmonics series expansion. We demonstrate the benefits of our approach on several 3D molecular design tasks, where we find that building in such symmetries significantly improves generalization and the quality of generated molecules.", + "title": "Symmetry-Aware Actor-Critic for 3D Molecular Design", + "authors": [ + "Gregor N. C. 
Simm", + "Robert Pinsler", + "G\u00e1bor Cs\u00e1nyi", + "Jos\u00e9 Miguel Hern\u00e1ndez-Lobato" + ], + "emails": [ + "cam.ac.uk", + "~Gregor_N._C._Simm1", + "~Robert_Pinsler1", + "~Jos\u00e9_Miguel_Hern\u00e1ndez-Lobato1" + ], + "rank": 363 + }, + { + "url": "https://openreview.net/forum?id=pqZV_srUVmK", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 8, + 7, + 8 + ], + "rating": "6.67", + "confidences": [ + 4, + 3, + 4, + 1 + ], + "abstract": "We study the global convergence and global optimality of actor-critic, one of the most popular families of reinforcement learning algorithms. While most existing works on actor-critic employ bi-level or two-timescale updates, we focus on the more practical single-timescale setting, where the actor and critic are updated simultaneously. Specifically, in each iteration, the critic update is obtained by applying the Bellman evaluation operator only once while the actor is updated in the policy gradient direction computed using the critic. Moreover, we consider two function approximation settings where both the actor and critic are represented by linear or deep neural networks. For both cases, we prove that the actor sequence converges to a globally optimal policy at a sublinear O(K\u22121/2) rate, where K is the number of iterations. To the best of our knowledge, we establish the rate of convergence and global optimality of single-timescale actor-critic with linear function approximation for the first time. Moreover, under the broader scope of policy optimization with nonlinear function approximation, we prove that actor-critic with deep neural network finds the globally optimal policy at a sublinear rate for the first time. 
", + "title": "Single-Timescale Actor-Critic Provably Finds Globally Optimal Policy", + "authors": [ + "Zuyue Fu", + "Zhuoran Yang", + "Zhaoran Wang" + ], + "emails": [ + "~Zhuoran_Yang1", + "~Zuyue_Fu1", + "~Zhaoran_Wang1" + ], + "rank": 364 + }, + { + "url": "https://openreview.net/forum?id=6BRLOfrMhW", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6 + ], + "rating": "6.67", + "confidences": [ + 3, + 3, + 3 + ], + "abstract": "Bloom filters are space-efficient probabilistic data structures that are used to test whether an element is a member of a set, and may return false positives. Recently, variations referred to as learned Bloom filters were developed that can provide improved performance in terms of the rate of false positives, by using a learned model for the represented set. However, previous methods for learned Bloom filters do not take full advantage of the learned model. Here we show how to frame the problem of optimal model utilization as an optimization problem, and using our framework derive algorithms that can achieve near-optimal performance in many cases.", + "title": "Partitioned Learned Bloom Filters", + "authors": [ + "Kapil Vaidya", + "Eric Knorr", + "Michael Mitzenmacher", + "Tim Kraska" + ], + "emails": [ + "g.harvard.edu", + "~Tim_Kraska1", + "~Michael_Mitzenmacher1", + "~Kapil_Vaidya1" + ], + "rank": 365 + }, + { + "url": "https://openreview.net/forum?id=LLoe0U9ShkN", + "decision": "Reject", + "ratings": [ + 6, + 7, + 7 + ], + "rating": "6.67", + "confidences": [ + 3, + 3, + 3 + ], + "abstract": "We derive the optimal approximate posterior over the top-layer weights in a Bayesian neural network for regression, and show that it exhibits strong dependencies on the lower-layer weights. We adapt this result to develop a correlated approximate posterior over the weights at all layers in a Bayesian neural network. We extend this approach to deep Gaussian processes, unifying inference in the two model classes. 
Our approximate posterior uses learned \"global\" inducing points, which are defined only at the input layer and propagated through the network to obtain inducing inputs at subsequent layers. By contrast, standard, \"local\", inducing point methods from the deep Gaussian process literature optimise a separate set of inducing inputs at every layer, and thus do not model correlations across layers. Our method gives state-of-the-art performance for a variational Bayesian method, without data augmentation or tempering, on CIFAR-10 of 86.7%.", + "title": "Global inducing point variational posteriors for Bayesian neural networks and deep Gaussian processes", + "authors": [ + "Sebastian W. Ober", + "Laurence Aitchison" + ], + "emails": [ + "~Laurence_Aitchison1", + "~Sebastian_W._Ober1" + ], + "rank": 366 + }, + { + "url": "https://openreview.net/forum?id=NomEDgIEBwE", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7 + ], + "rating": "6.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "We propose methods to strengthen the invariance properties of representations obtained by contrastive learning. While existing approaches implicitly induce a degree of invariance as representations are learned, we look to more directly enforce invariance in the encoding process. To this end, we first introduce a training objective for contrastive learning that uses a novel regularizer to control how the representation changes under transformation. We show that representations trained with this objective perform better on downstream tasks and are more robust to the introduction of nuisance transformations at test time. Second, we propose a change to how test time representations are generated by introducing a feature averaging approach that combines encodings from multiple transformations of the original input, finding that this leads to across the board performance gains. 
Finally, we introduce the novel Spirograph dataset to explore our ideas in the context of a differentiable generative process with multiple downstream tasks, showing that our techniques for learning invariance are highly beneficial.", + "title": "Improving Transformation Invariance in Contrastive Representation Learning", + "authors": [ + "Adam Foster", + "Rattana Pukdee", + "Tom Rainforth" + ], + "emails": [ + "~Tom_Rainforth1", + "~Rattana_Pukdee1", + "~Adam_Foster1" + ], + "rank": 367 + }, + { + "url": "https://openreview.net/forum?id=4T489T4yav", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6 + ], + "rating": "6.67", + "confidences": [ + 3, + 3, + 3 + ], + "abstract": "Segmented models are widely used to describe non-stationary sequential data with discrete change points. Their estimation usually requires solving a mixed discrete-continuous optimization problem, where the segmentation is the discrete part and all other model parameters are continuous. A number of estimation algorithms have been developed that are highly specialized for their specific model assumptions. The dependence on non-standard algorithms makes it hard to integrate segmented models in state-of-the-art deep learning architectures that critically depend on gradient-based optimization techniques. In this work, we formulate a relaxed variant of segmented models that enables joint estimation of all model parameters, including the segmentation, with gradient descent. We build on recent advances in learning continuous warping functions and propose a novel family of warping functions based on the two-sided power (TSP) distribution. TSP-based warping functions are differentiable, have simple closed-form expressions, and can represent segmentation functions exactly. Our formulation includes the important class of segmented generalized linear models as a special case, which makes it highly versatile. 
We use our approach to model the spread of COVID-19 with Poisson regression, apply it on a change point detection task, and learn classification models with concept drift. The experiments show that our approach effectively learns all these tasks with standard algorithms for gradient descent.", + "title": "Differentiable Segmentation of Sequences", + "authors": [ + "Erik Scharw\u00e4chter", + "Jonathan Lennartz", + "Emmanuel M\u00fcller" + ], + "emails": [ + "uni-bonn.de", + "~Erik_Scharw\u00e4chter1", + "cs.tu-dortmund.de" + ], + "rank": 368 + }, + { + "url": "https://openreview.net/forum?id=gl3D-xY7wLq", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 6, + 8 + ], + "rating": "6.67", + "confidences": [ + 3, + 3, + 4, + 5 + ], + "abstract": "We assess the tendency of state-of-the-art object recognition models to depend on signals from image backgrounds. We create a toolkit for disentangling foreground and background signal on ImageNet images, and find that (a) models can achieve non-trivial accuracy by relying on the background alone, (b) models often misclassify images even in the presence of correctly classified foregrounds--up to 88% of the time with adversarially chosen backgrounds, and (c) more accurate models tend to depend on backgrounds less. 
Our analysis of backgrounds brings us closer to understanding which correlations machine learning models use, and how they determine models' out of distribution performance.\n", + "title": "Noise or Signal: The Role of Image Backgrounds in Object Recognition", + "authors": [ + "Kai Yuanqing Xiao", + "Logan Engstrom", + "Andrew Ilyas", + "Aleksander Madry" + ], + "emails": [ + "~Kai_Yuanqing_Xiao1", + "~Logan_Engstrom1", + "~Aleksander_Madry1", + "~Andrew_Ilyas1" + ], + "rank": 369 + }, + { + "url": "https://openreview.net/forum?id=ct8_a9h1M", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7 + ], + "rating": "6.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Dropout has been demonstrated as a simple and effective module to not only regularize the training process of deep neural networks, but also provide the uncertainty estimation for prediction. However, the quality of uncertainty estimation is highly dependent on the dropout probabilities. Most current models use the same dropout distributions across all data samples due to its simplicity. Despite the potential gains in the flexibility of modeling uncertainty, sample-dependent dropout, on the other hand, is less explored as it often encounters scalability issues or involves non-trivial model changes. In this paper, we propose contextual dropout with an efficient structural design as a simple and scalable sample-dependent dropout module, which can be applied to a wide range of models at the expense of only slightly increased memory and computational cost. We learn the dropout probabilities with a variational objective, compatible with both Bernoulli dropout and Gaussian dropout. We apply the contextual dropout module to various models with applications to image classification and visual question answering and demonstrate the scalability of the method with large-scale datasets, such as ImageNet and VQA 2.0. 
Our experimental results show that the proposed method outperforms baseline methods in terms of both accuracy and quality of uncertainty estimation.", + "title": "Contextual Dropout: An Efficient Sample-Dependent Dropout Module", + "authors": [ + "XINJIE FAN", + "Shujian Zhang", + "Korawat Tanwisuth", + "Xiaoning Qian", + "Mingyuan Zhou" + ], + "emails": [ + "~Shujian_Zhang1", + "utexas.edu", + "~Mingyuan_Zhou1", + "~XINJIE_FAN2", + "~Xiaoning_Qian2" + ], + "rank": 370 + }, + { + "url": "https://openreview.net/forum?id=pW2Q2xLwIMD", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 8, + 7, + 6 + ], + "rating": "6.67", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "This paper studies few-shot learning via representation learning, where one uses T source tasks with n1 data per task to learn a representation in order to reduce the sample complexity of a target task for which there is only n2(\u226an1) data. Specifically, we focus on the setting where there exists a good common representation between source and target, and our goal is to understand how much a sample size reduction is possible. First, we study the setting where this common representation is low-dimensional and provide a risk bound of O~(dkn1T+kn2) on the target task for the linear representation class; here d is the ambient input dimension and k(\u226ad) is the dimension of the representation. This result bypasses the \u03a9(1T) barrier under the i.i.d. task assumption, and can capture the desired property that all n1T samples from source tasks can be \\emph{pooled} together for representation learning. We further extend this result to handle a general representation function class and obtain a similar result. 
Next, we consider the setting where the common representation may be high-dimensional but is capacity-constrained (say in norm); here, we again demonstrate the advantage of representation learning in both high-dimensional linear regression and neural networks, and show that representation learning can fully utilize all n1T samples from source tasks.", + "title": "Few-Shot Learning via Learning the Representation, Provably", + "authors": [ + "Simon Shaolei Du", + "Wei Hu", + "Sham M. Kakade", + "Jason D. Lee", + "Qi Lei" + ], + "emails": [ + "~Sham_M._Kakade1", + "~Jason_D._Lee1", + "~Qi_Lei1", + "~Simon_Shaolei_Du1", + "~Wei_Hu1" + ], + "rank": 371 + }, + { + "url": "https://openreview.net/forum?id=_ptUyYP19mP", + "decision": "Reject", + "ratings": [ + 5, + 4, + 7, + 9, + 8 + ], + "rating": "6.67", + "confidences": [ + 4, + 4, + 4, + 4, + 5 + ], + "abstract": "Efficient exploration under sparse rewards remains a key challenge in deep reinforcement learning. To guide exploration, previous work makes extensive use of intrinsic reward (IR). There are many heuristics for IR, including visitation counts, curiosity, and state-difference. In this paper, we analyze the pros and cons of each method and propose the regulated difference of inverse visitation counts as a simple but effective criterion for IR. The criterion helps the agent explore Beyond the Boundary of explored regions and mitigates common issues in count-based methods, such as short-sightedness and detachment. The resulting method, BeBold, solves the 12 most challenging procedurally-generated tasks in MiniGridwith just 120M environment steps, without any curriculum learning. In comparison, previous SoTA only solves 50%of the tasks. 
BeBold also achieves SoTAon multiple tasks in NetHack, a popular rogue-like game that contains more challenging procedurally-generated environments.", + "title": "BeBold: Exploration Beyond the Boundary of Explored Regions", + "authors": [ + "Tianjun Zhang", + "Huazhe Xu", + "Xiaolong Wang", + "Yi Wu", + "Kurt Keutzer", + "Joseph E. Gonzalez", + "Yuandong Tian" + ], + "emails": [ + "~Kurt_Keutzer1", + "~Huazhe_Xu1", + "~Joseph_E._Gonzalez1", + "~Xiaolong_Wang3", + "~Tianjun_Zhang1", + "~Yuandong_Tian1", + "~Yi_Wu1" + ], + "rank": 372 + }, + { + "url": "https://openreview.net/forum?id=_i3ASPp12WS", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7 + ], + "rating": "6.67", + "confidences": [ + 4, + 3, + 5 + ], + "abstract": "Deep neural networks are known to be vulnerable to adversarial examples, where a perturbation in the input space leads to an amplified shift in the latent network representation. In this paper, we combine canonical supervised learning with self-supervised representation learning, and present Self-supervised Online Adversarial Purification (SOAP), a novel defense strategy that uses a self-supervised loss to purify adversarial examples at test-time. Our approach leverages the label-independent nature of self-supervised signals and counters the adversarial perturbation with respect to the self-supervised tasks. SOAP yields competitive robust accuracy against state-of-the-art adversarial training and purification methods, with considerably less training complexity. In addition, our approach is robust even when adversaries are given the knowledge of the purification defense strategy. 
To the best of our knowledge, our paper is the first that generalizes the idea of using self-supervised signals to perform online test-time purification.", + "title": "Online Adversarial Purification based on Self-supervised Learning", + "authors": [ + "Changhao Shi", + "Chester Holtz", + "Gal Mishne" + ], + "emails": [ + "~Chester_Holtz1", + "~Gal_Mishne1", + "~Changhao_Shi1" + ], + "rank": 373 + }, + { + "url": "https://openreview.net/forum?id=tnSo6VRLmT", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 6, + 6 + ], + "rating": "6.67", + "confidences": [ + 3, + 3, + 3 + ], + "abstract": "In this paper, we present a novel approach for conformal prediction (CP), in which we aim to identify a set of promising prediction candidates---in place of a single prediction. This set is guaranteed to contain a correct answer with high probability, and is well-suited for many open-ended classification tasks. In the standard CP paradigm, the predicted set can often be unusably large and also costly to obtain. This is particularly pervasive in settings where the correct answer is not unique, and the number of total possible answers is high. We first expand the CP correctness criterion to allow for additional, inferred \"admissible\" answers, which can substantially reduce the size of the predicted set while still providing valid performance guarantees. Second, we amortize costs by conformalizing prediction cascades, in which we aggressively prune implausible labels early on by using progressively stronger classifiers---again, while still providing valid performance guarantees. We demonstrate the empirical effectiveness of our approach for multiple applications in natural language processing and computational chemistry for drug discovery.", + "title": "Efficient Conformal Prediction via Cascaded Inference with Expanded Admission", + "authors": [ + "Adam Fisch", + "Tal Schuster", + "Tommi S. 
Jaakkola", + "Regina Barzilay" + ], + "emails": [ + "~Regina_Barzilay1", + "~Adam_Fisch2", + "~Tommi_S._Jaakkola1", + "~Tal_Schuster1" + ], + "rank": 374 + }, + { + "url": "https://openreview.net/forum?id=-ODN6SbiUU", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 5, + 6, + 9 + ], + "rating": "6.65", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "The recent research in semi-supervised learning (SSL) is mostly dominated by consistency regularization based methods which achieve strong performance. However, they heavily rely on domain-specific data augmentations, which are not easy to generate for all data modalities. Pseudo-labeling (PL) is a general SSL approach that does not have this constraint but performs relatively poorly in its original formulation. We argue that PL underperforms due to the erroneous high confidence predictions from poorly calibrated models; these predictions generate many incorrect pseudo-labels, leading to noisy training. We propose an uncertainty-aware pseudo-label selection (UPS) framework which improves pseudo labeling accuracy by drastically reducing the amount of noise encountered in the training process. Furthermore, UPS generalizes the pseudo-labeling process, allowing for the creation of negative pseudo-labels; these negative pseudo-labels can be used for multi-label classification as well as negative learning to improve the single-label classification. We achieve strong performance when compared to recent SSL methods on the CIFAR-10 and CIFAR-100 datasets. 
Also, we demonstrate the versatility of our method on the video dataset UCF-101 and the multi-label dataset Pascal VOC.", + "title": "In Defense of Pseudo-Labeling: An Uncertainty-Aware Pseudo-label Selection Framework for Semi-Supervised Learning", + "authors": [ + "Mamshad Nayeem Rizve", + "Kevin Duarte", + "Yogesh S Rawat", + "Mubarak Shah" + ], + "emails": [ + "~Yogesh_S_Rawat1", + "~Mamshad_Nayeem_Rizve1", + "~Mubarak_Shah3", + "~Kevin_Duarte1" + ], + "rank": 375 + }, + { + "url": "https://openreview.net/forum?id=eIHYL6fpbkA", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7, + 6 + ], + "rating": "6.64", + "confidences": [ + 5, + 1, + 4, + 4 + ], + "abstract": "Several data augmentation methods deploy unlabeled-in-distribution (UID) data to bridge the gap between the training and inference of neural networks. However, these methods have clear limitations in terms of availability of UID data and dependence of algorithms on pseudo-labels. Herein, we propose a data augmentation method to improve generalization in both adversarial and standard learning by using out-of-distribution (OOD) data that are devoid of the abovementioned issues. We show how to improve generalization theoretically using OOD data in each learning scenario and complement our theoretical analysis with experiments on CIFAR-10, CIFAR-100, and a subset of ImageNet. The results indicate that undesirable features are shared even among image data that seem to have little correlation from a human point of view. We also present the advantages of the proposed method through comparison with other data augmentation methods, which can be used in the absence of UID data. 
Furthermore, we demonstrate that the proposed method can further improve the existing state-of-the-art adversarial training.", + "title": "Removing Undesirable Feature Contributions Using Out-of-Distribution Data", + "authors": [ + "Saehyung Lee", + "Changhwa Park", + "Hyungyu Lee", + "Jihun Yi", + "Jonghyun Lee", + "Sungroh Yoon" + ], + "emails": [ + "~Hyungyu_Lee2", + "~Jihun_Yi1", + "~Saehyung_Lee1", + "~Sungroh_Yoon1", + "~Jonghyun_Lee1", + "~Changhwa_Park1" + ], + "rank": 376 + }, + { + "url": "https://openreview.net/forum?id=ac288vnG_7U", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6 + ], + "rating": "6.64", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "Many sequential decision making tasks can be viewed as combinatorial optimization problems over a large number of actions. When the cost of evaluating an action is high, even a greedy algorithm, which iteratively picks the best action given the history, is prohibitive to run. In this paper, we aim to learn a greedy heuristic for sequentially selecting actions as a surrogate for invoking the expensive oracle when evaluating an action. In particular, we focus on a class of combinatorial problems that can be solved via submodular maximization (either directly on the objective function or via submodular surrogates). We introduce a data-driven optimization framework based on the submodular-norm loss, a novel loss function that encourages the resulting objective to exhibit diminishing returns. Our framework outputs a surrogate objective that is efficient to train, approximately submodular, and can be made permutation-invariant. The latter two properties allow us to prove strong approximation guarantees for the learned greedy heuristic. Furthermore, we show that our model can be easily integrated with modern deep imitation learning pipelines for sequential prediction tasks. 
We demonstrate the performance of our algorithm on a variety of batched and sequential optimization tasks, including set cover, active learning, and Bayesian optimization for protein engineering.", + "title": "Learning to Make Decisions via Submodular Regularization", + "authors": [ + "Ayya Alieva", + "Aiden Aceves", + "Jialin Song", + "Stephen Mayo", + "Yisong Yue", + "Yuxin Chen" + ], + "emails": [ + "~Jialin_Song1", + "~Yisong_Yue1", + "gmail.com", + "~Stephen_Mayo1", + "~Yuxin_Chen1", + "caltech.edu" + ], + "rank": 377 + }, + { + "url": "https://openreview.net/forum?id=9G5MIc-goqB", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6 + ], + "rating": "6.64", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "Data augmentation is an effective technique to improve the generalization of deep neural networks. However, previous data augmentation methods usually treat the augmented samples equally without considering their individual impacts on the model. To address this, for the augmented samples from the same training example, we propose to assign different weights to them. We construct the maximal expected loss which is the supremum over any reweighted loss on augmented samples. Inspired by adversarial training, we minimize this maximal expected loss (MMEL) and obtain a simple and interpretable closed-form solution: more attention should be paid to augmented samples with large loss values (i.e., harder examples). Minimizing this maximal expected loss enables the model to perform well under any reweighting strategy. The proposed method can generally be applied on top of any data augmentation methods. Experiments are conducted on both natural language understanding tasks with token-level data augmentation, and image classification tasks with commonly-used image augmentation techniques like random crop and horizontal flip. 
Empirical results show that the proposed method improves the generalization performance of the model.", + "title": "Reweighting Augmented Samples by Minimizing the Maximal Expected Loss", + "authors": [ + "Mingyang Yi", + "Lu Hou", + "Lifeng Shang", + "Xin Jiang", + "Qun Liu", + "Zhi-Ming Ma" + ], + "emails": [ + "~Qun_Liu1", + "~Xin_Jiang1", + "~Mingyang_Yi1", + "~Zhi-Ming_Ma1", + "~Lu_Hou2", + "~Lifeng_Shang1" + ], + "rank": 378 + }, + { + "url": "https://openreview.net/forum?id=vkxGQB9f2Vg", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 10 + ], + "rating": "6.64", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Backpropagating gradients through random variables is at the heart of numerous machine learning applications. In this paper, we present a general framework for deriving stochastic backpropagation rules for any distribution, discrete or continuous. Our approach exploits the link between the characteristic function and the Fourier transform, to transport the derivatives from the parameters of the distribution to the random variable. Our method generalizes previously known estimators, and results in new estimators for the gamma, beta, Dirichlet and Laplace distributions. 
Furthermore, we show that the classical deterministic backproapagation rule and the discrete random variable case, can also be interpreted through stochastic backpropagation.", + "title": "Fourier Stochastic Backpropagation", + "authors": [ + "Amine Echraibi", + "Joachim Flocon Cholet", + "St\u00e9phane Gosselin", + "Sandrine Vaton" + ], + "emails": [ + "orange.com", + "~Amine_Echraibi1", + "imt-atlantique.fr" + ], + "rank": 379 + }, + { + "url": "https://openreview.net/forum?id=YTWGvpFOQD-", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 7, + 6 + ], + "rating": "6.64", + "confidences": [ + 2, + 4, + 3, + 5 + ], + "abstract": "We demonstrate that differentially private machine learning has not yet reached its ''AlexNet moment'' on many canonical vision tasks: linear models trained on handcrafted features significantly outperform end-to-end deep neural networks for moderate privacy budgets.\nTo exceed the performance of handcrafted features, we show that private learning requires either much more private data, or access to features learned on public data from a similar domain.\nOur work introduces simple yet strong baselines for differentially private learning that can inform the evaluation of future progress in this area.", + "title": "Differentially Private Learning Needs Better Features (or Much More Data)", + "authors": [ + "Florian Tramer", + "Dan Boneh" + ], + "emails": [ + "~Florian_Tramer1", + "~Dan_Boneh1" + ], + "rank": 380 + }, + { + "url": "https://openreview.net/forum?id=kmqjgSNXby", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7 + ], + "rating": "6.64", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "Standard dynamics models for continuous control make use of feedforward computation to predict the conditional distribution of next state and reward given current state and action using a multivariate Gaussian with a diagonal covariance structure. 
This modeling choice assumes that different dimensions of the next state and reward are conditionally independent given the current state and action and may be driven by the fact that fully observable physics-based simulation environments entail deterministic transition dynamics. In this paper, we challenge this conditional independence assumption and propose a family of expressive autoregressive dynamics models that generate different dimensions of the next state and reward sequentially conditioned on previous dimensions. We demonstrate that autoregressive dynamics models indeed outperform standard feedforward models in log-likelihood on heldout transitions. Furthermore, we compare different model-based and model-free off-policy evaluation (OPE) methods on RL Unplugged, a suite of offline MuJoCo datasets, and find that autoregressive dynamics models consistently outperform all baselines, achieving a new state-of-the-art. Finally, we show that autoregressive dynamics models are useful for offline policy optimization by serving as a way to enrich the replay buffer through data augmentation and improving performance using model-based planning.\n", + "title": "Autoregressive Dynamics Models for Offline Policy Evaluation and Optimization", + "authors": [ + "Michael R Zhang", + "Thomas Paine", + "Ofir Nachum", + "Cosmin Paduraru", + "George Tucker", + "ziyu wang", + "Mohammad Norouzi" + ], + "emails": [ + "~George_Tucker1", + "~Thomas_Paine1", + "~Ofir_Nachum1", + "~Mohammad_Norouzi1", + "~Cosmin_Paduraru1", + "~Michael_R_Zhang1", + "~ziyu_wang1" + ], + "rank": 381 + }, + { + "url": "https://openreview.net/forum?id=CF-ZIuSMXRz", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 9, + 7, + 6 + ], + "rating": "6.64", + "confidences": [ + 4, + 2, + 3, + 5 + ], + "abstract": "Although spatio-temporal graph neural networks have achieved great empirical success in handling multiple correlated time series, they may be impractical in some real-world scenarios due to a lack 
of sufficient high-quality training data. Furthermore, spatio-temporal graph neural networks lack theoretical interpretation. To address these issues, we put forth a novel mathematically designed framework to analyze spatio-temporal data. Our proposed spatio-temporal graph scattering transform (ST-GST) extends traditional scattering transform to the spatio-temporal domain. It performs iterative applications of spatio-temporal graph wavelets and nonlinear activation functions, which can be viewed as a forward pass of spatio-temporal graph convolutional networks without training. Since all the filter coefficients in ST-GST are mathematically designed, it is promising for the real-world scenarios with limited training data, and also allows for a theoretical analysis, which shows that the proposed ST-GST is stable to small perturbations of input signals and structures. Finally, our experiments show that i) ST-GST outperforms spatio-temporal graph convolutional networks by an increase of 35% in accuracy for MSR Action3D dataset; ii) it is better and computationally more efficient to design the transform based on separable spatio-temporal graphs than the joint ones; and iii) nonlinearity in ST-GST is critical to empirical performance.", + "title": "Spatio-Temporal Graph Scattering Transform", + "authors": [ + "Chao Pan", + "Siheng Chen", + "Antonio Ortega" + ], + "emails": [ + "~Antonio_Ortega1", + "~Chao_Pan2", + "~Siheng_Chen1" + ], + "rank": 382 + }, + { + "url": "https://openreview.net/forum?id=45NZvF1UHam", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6 + ], + "rating": "6.64", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "Hamiltonian mechanics is an effective tool to represent many physical processes with concise yet well-generalized mathematical expressions. A well-modeled Hamiltonian makes it easy for researchers to analyze and forecast many related phenomena that are governed by the same physical law. 
However, in general, identifying a functional or shared expression of the Hamiltonian is very difficult. It requires carefully designed experiments and the researcher's insight that comes from years of experience. We propose that meta-learning algorithms can be potentially powerful data-driven tools for identifying the physical law governing Hamiltonian systems without any mathematical assumptions on the representation, but with observations from a set of systems governed by the same physical law. We show that a well meta-trained learner can identify the shared representation of the Hamiltonian by evaluating our method on several types of physical systems with various experimental settings.", + "title": "Identifying Physical Law of Hamiltonian Systems via Meta-Learning", + "authors": [ + "Seungjun Lee", + "Haesang Yang", + "Woojae Seong" + ], + "emails": [ + "~Woojae_Seong1", + "~Haesang_Yang1", + "~Seungjun_Lee1" + ], + "rank": 383 + }, + { + "url": "https://openreview.net/forum?id=tiqI7w64JG2", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 8, + 7 + ], + "rating": "6.64", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "From the perspectives of expressive power and learning, this work compares multi-layer Graph Neural Networks (GNNs) with a simplified alternative that we call Graph-Augmented Multi-Layer Perceptrons (GA-MLPs), which first augments node features with certain multi-hop operators on the graph and then applies learnable node-wise functions. From the perspective of graph isomorphism testing, we show both theoretically and numerically that GA-MLPs with suitable operators can distinguish almost all non-isomorphic graphs, just like the Weisfeiler-Lehman (WL) test and GNNs. However, by viewing them as node-level functions and examining the equivalence classes they induce on rooted graphs, we prove a separation in expressive power between GA-MLPs and GNNs that grows exponentially in depth. 
In particular, unlike GNNs, GA-MLPs are unable to count the number of attributed walks. We also demonstrate via community detection experiments that GA-MLPs can be limited by their choice of operator family, whereas GNNs have higher flexibility in learning.", + "title": "On Graph Neural Networks versus Graph-Augmented MLPs", + "authors": [ + "Lei Chen", + "Zhengdao Chen", + "Joan Bruna" + ], + "emails": [ + "~Lei_Chen4", + "~Zhengdao_Chen1", + "~Joan_Bruna1" + ], + "rank": 384 + }, + { + "url": "https://openreview.net/forum?id=jN5y-zb5Q7m", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6 + ], + "rating": "6.64", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "Uncertainty estimation is important for ensuring safety and robustness of AI systems. While most research in the area has focused on un-structured prediction tasks, limited work has investigated general uncertainty estimation approaches for structured prediction. Thus, this work aims to investigate uncertainty estimation for structured prediction tasks within a single unified and interpretable probabilistic ensemble-based framework. We consider: uncertainty estimation for sequence data at the token-level and complete sequence-level; interpretations for, and applications of, various measures of uncertainty; and discuss both the theoretical and practical challenges associated with obtaining them. 
This work also provides baselines for token-level and sequence-level error detection, and sequence-level out-of-domain input detection on the WMT\u201914 English-French and WMT\u201917 English-German translation and LibriSpeech speech recognition datasets.", + "title": "Uncertainty Estimation in Autoregressive Structured Prediction", + "authors": [ + "Andrey Malinin", + "Mark Gales" + ], + "emails": [ + "~Andrey_Malinin1", + "~Mark_Gales1" + ], + "rank": 385 + }, + { + "url": "https://openreview.net/forum?id=BbNIbVPJ-42", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 6 + ], + "rating": "6.64", + "confidences": [ + 3, + 2, + 2, + 4 + ], + "abstract": "Invariant Causal Prediction (Peters et al., 2016) is a technique for out-of-distribution generalization which assumes that some aspects of the data distribution vary across the training set but that the underlying causal mechanisms remain constant. Recently, Arjovsky et al. (2019) proposed Invariant Risk Minimization (IRM), an objective based on this idea for learning deep, invariant features of data which are a complex function of latent variables; many alternatives have subsequently been suggested. However, formal guarantees for all of these works are severely lacking. In this paper, we present the first analysis of classification under the IRM objective\u2014as well as these recently proposed alternatives\u2014under a fairly natural and general model. In the linear case, we show simple conditions under which the optimal solution succeeds or, more often, fails to recover the optimal invariant predictor. We furthermore present the very first results in the non-linear regime: we demonstrate that IRM can fail catastrophically unless the test data is sufficiently similar to the training distribution\u2014this is precisely the issue that it was intended to solve. 
Thus, in this setting we find that IRM and its alternatives fundamentally do not improve over standard Empirical Risk Minimization.", + "title": "The Risks of Invariant Risk Minimization", + "authors": [ + "Elan Rosenfeld", + "Pradeep Kumar Ravikumar", + "Andrej Risteski" + ], + "emails": [ + "~Elan_Rosenfeld1", + "~Andrej_Risteski2", + "~Pradeep_Kumar_Ravikumar1" + ], + "rank": 386 + }, + { + "url": "https://openreview.net/forum?id=jDdzh5ul-d", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7 + ], + "rating": "6.64", + "confidences": [ + 4, + 5, + 5 + ], + "abstract": "Federated learning (FL) is a distributed machine learning architecture that leverages a large number of workers to jointly learn a model with decentralized data. FL has received increasing attention in recent years thanks to its data privacy protection, communication efficiency and a linear speedup for convergence in training (i.e., convergence performance increases linearly with respect to the number of workers). However, existing studies on linear speedup for convergence are only limited to the assumptions of i.i.d. datasets across workers and/or full worker participation, both of which rarely hold in practice. So far, it remains an open question whether or not the linear speedup for convergence is achievable under non-i.i.d. datasets with partial worker participation in FL. In this paper, we show that the answer is affirmative. Specifically, we show that the federated averaging (FedAvg) algorithm (with two-sided learning rates) on non-i.i.d. datasets in non-convex settings achieves a convergence rate O(1mKT+1T) for full worker participation and a convergence rate O(1nKT+1T) for partial worker participation, where K is the number of local steps, T is the number of total communication rounds, m is the total worker number and n is the worker number in one communication round if for partial worker participation. 
Our results also reveal that the local steps in FL could help the convergence and show that the maximum number of local steps can be improved to T/m. We conduct extensive experiments on MNIST and CIFAR-10 to verify our theoretical results.", + "title": "Achieving Linear Speedup with Partial Worker Participation in Non-IID Federated Learning", + "authors": [ + "Haibo Yang", + "Minghong Fang", + "Jia Liu" + ], + "emails": [ + "~Haibo_Yang1", + "~Jia_Liu1", + "iastate.edu" + ], + "rank": 387 + }, + { + "url": "https://openreview.net/forum?id=9EKHN1jOlA", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 7 + ], + "rating": "6.64", + "confidences": [ + 2, + 3, + 4, + 2 + ], + "abstract": "Uncertainty quantification is crucial for building reliable and trustable machine learning systems. We propose to estimate uncertainty in recurrent neural networks (RNNs) via stochastic discrete state transitions over recurrent timesteps. The uncertainty of the model can be quantified by running a prediction several times, each time sampling from the recurrent state transition distribution, leading to potentially different results if the model is uncertain. Alongside uncertainty quantification, our proposed method offers several advantages in different settings. The proposed method can (1) learn deterministic and probabilistic automata from data, (2) learn well-calibrated models on real-world classification tasks, (3) improve the performance of out-of-distribution detection, and (4) control the exploration-exploitation trade-off in reinforcement learning. 
An implementation is available.", + "title": "Uncertainty Estimation and Calibration with Finite-State Probabilistic RNNs", + "authors": [ + "Cheng Wang", + "Carolin Lawrence", + "Mathias Niepert" + ], + "emails": [ + "~Cheng_Wang9", + "~Carolin_Lawrence1", + "~Mathias_Niepert1" + ], + "rank": 388 + }, + { + "url": "https://openreview.net/forum?id=T1XmO8ScKim", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 7 + ], + "rating": "6.64", + "confidences": [ + 3, + 1, + 4, + 3 + ], + "abstract": "Continuous input signals like images and time series that are irregularly sampled or have missing values are challenging for existing deep learning methods. Coherently defined feature representations must depend on the values in unobserved regions of the input. Drawing from the work in probabilistic numerics, we propose Probabilistic Numeric Convolutional Neural Networks which represent features as Gaussian processes, providing a probabilistic description of discretization error. We then define a convolutional layer as the evolution of a PDE defined on this GP, followed by a nonlinearity. This approach also naturally admits steerable equivariant convolutions under e.g. the rotation group. 
In experiments we show that our approach yields a 3\u00d7 reduction of error from the previous state of the art on the SuperPixel-MNIST dataset and competitive performance on the medical time series dataset PhysioNet2012.", + "title": "Probabilistic Numeric Convolutional Neural Networks", + "authors": [ + "Marc Anton Finzi", + "Roberto Bondesan", + "Max Welling" + ], + "emails": [ + "~Marc_Anton_Finzi1", + "~Roberto_Bondesan1", + "~Max_Welling1" + ], + "rank": 389 + }, + { + "url": "https://openreview.net/forum?id=TVjLza1t4hI", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7 + ], + "rating": "6.64", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "Despite extensive standardization, diagnostic interviews for mental health disorders encompass substantial subjective judgment. Previous studies have demonstrated that EEG-based neural measures can function as reliable objective correlates of depression, or even predictors of depression and its course. However, their clinical utility has not been fully realized because of 1) the lack of automated ways to deal with the inherent noise associated with EEG data at scale, and 2) the lack of knowledge of which aspects of the EEG signal may be markers of a clinical disorder. Here we adapt an unsupervised pipeline from the recent deep representation learning literature to address these problems by 1) learning a disentangled representation using \u03b2-VAE to denoise the signal, and 2) extracting interpretable features associated with a sparse set of clinical labels using a Symbol-Concept Association Network (SCAN). We demonstrate that our method is able to outperform the canonical hand-engineered baseline classification method on a number of factors, including participant age and depression diagnosis. 
Furthermore, our method recovers a representation that can be used to automatically extract denoised Event Related Potentials (ERPs) from novel, single EEG trajectories, and supports fast supervised re-mapping to various clinical labels, allowing clinicians to re-use a single EEG representation regardless of updates to the standardized diagnostic system. Finally, single factors of the learned disentangled representations often correspond to meaningful markers of clinical factors, as automatically detected by SCAN, allowing for human interpretability and post-hoc expert analysis of the recommendations made by the model.", + "title": "Representation learning for improved interpretability and classification accuracy of clinical factors from EEG", + "authors": [ + "Garrett Honke", + "Irina Higgins", + "Nina Thigpen", + "Vladimir Miskovic", + "Katie Link", + "Sunny Duan", + "Pramod Gupta", + "Julia Klawohn", + "Greg Hajcak" + ], + "emails": [ + "~Vladimir_Miskovic1", + "~Greg_Hajcak1", + "google.com", + "~Irina_Higgins1", + "hu-berlin.de" + ], + "rank": 390 + }, + { + "url": "https://openreview.net/forum?id=AHm3dbp7D1D", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6 + ], + "rating": "6.64", + "confidences": [ + 4, + 5, + 5 + ], + "abstract": "This paper is concerned with self-supervised learning for small models. The problem is motivated by our empirical studies that while the widely used contrastive self-supervised learning method has shown great progress on large model training, it does not work well for small models. To address this problem, we propose a new learning paradigm, named SElf-SupErvised Distillation (SEED), where we leverage a larger network (as Teacher) to transfer its representational knowledge into a smaller architecture (as Student) in a self-supervised fashion. Instead of directly learning from unlabeled data, we train a student encoder to mimic the similarity score distribution inferred by a teacher over a set of instances. 
We show that SEED dramatically boosts the performance of small networks on downstream tasks. Compared with self-supervised baselines, SEED improves the top-1 accuracy from 42.2% to 67.6% on EfficientNet-B0 and from 36.3% to 68.2% on MobileNet-v3-Large on the ImageNet-1k dataset. ", + "title": "SEED: Self-supervised Distillation For Visual Representation", + "authors": [ + "Zhiyuan Fang", + "Jianfeng Wang", + "Lijuan Wang", + "Lei Zhang", + "Yezhou Yang", + "Zicheng Liu" + ], + "emails": [ + "~Zhiyuan_Fang1", + "~Yezhou_Yang1", + "~Jianfeng_Wang4", + "~Lijuan_Wang1", + "~Lei_Zhang23", + "~Zicheng_Liu1" + ], + "rank": 391 + }, + { + "url": "https://openreview.net/forum?id=tqOvYpjPax2", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 8, + 7, + 6 + ], + "rating": "6.64", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Several works have shown that the regularization mechanisms underlying deep neural networks' generalization performances are still poorly understood. In this paper, we hypothesize that deep neural networks are regularized through their ability to extract meaningful clusters among the samples of a class. This constitutes an implicit form of regularization, as no explicit training mechanisms or supervision target such behaviour. To support our hypothesis, we design four different measures of intraclass clustering, based on the neuron- and layer-level representations of the training data. 
We then show that these measures constitute accurate predictors of generalization performance across variations of a large set of hyperparameters (learning rate, batch size, optimizer, weight decay, dropout rate, data augmentation, network depth and width).", + "title": "Intraclass clustering: an implicit learning ability that regularizes DNNs", + "authors": [ + "Simon Carbonnelle", + "Christophe De Vleeschouwer" + ], + "emails": [ + "~Simon_Carbonnelle1", + "~Christophe_De_Vleeschouwer1" + ], + "rank": 392 + }, + { + "url": "https://openreview.net/forum?id=_X_4Akcd8Re", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 7 + ], + "rating": "6.62", + "confidences": [ + 2, + 4, + 3, + 4 + ], + "abstract": "Learning long-term dynamics models is the key to understanding physical common sense. Most existing approaches on learning dynamics from visual input sidestep long-term predictions by resorting to rapid re-planning with short-term models. This not only requires such models to be super accurate but also limits them only to tasks where an agent can continuously obtain feedback and take action at each step until completion. In this paper, we aim to leverage the ideas from success stories in visual recognition tasks to build object representations that can capture inter-object and object-environment interactions over a long range. To this end, we propose Region Proposal Interaction Networks (RPIN), which reason about each object's trajectory in a latent region-proposal feature space. Thanks to the simple yet effective object representation, our approach outperforms prior methods by a significant margin both in terms of prediction quality and their ability to plan for downstream tasks, and also generalize well to novel environments. 
Code, pre-trained models, and more visualization results are available at https://haozhi.io/RPIN.", + "title": "Learning Long-term Visual Dynamics with Region Proposal Interaction Networks", + "authors": [ + "Haozhi Qi", + "Xiaolong Wang", + "Deepak Pathak", + "Yi Ma", + "Jitendra Malik" + ], + "emails": [ + "~Haozhi_Qi1", + "~Jitendra_Malik2", + "~Deepak_Pathak1", + "~Xiaolong_Wang3", + "~Yi_Ma4" + ], + "rank": 393 + }, + { + "url": "https://openreview.net/forum?id=K5YasWXZT3O", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 8 + ], + "rating": "6.62", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Empirical risk minimization (ERM) is typically designed to perform well on the average loss, which can result in estimators that are sensitive to outliers, generalize poorly, or treat subgroups unfairly. While many methods aim to address these problems individually, in this work, we explore them through a unified framework---tilted empirical risk minimization (TERM). In particular, we show that it is possible to flexibly tune the impact of individual losses through a straightforward extension to ERM using a hyperparameter called the tilt. We provide several interpretations of the resulting framework: We show that TERM can increase or decrease the influence of outliers, respectively, to enable fairness or robustness; has variance-reduction properties that can benefit generalization; and can be viewed as a smooth approximation to a superquantile method. We develop batch and stochastic first-order optimization methods for solving TERM, and show that the problem can be efficiently solved relative to common alternatives. Finally, we demonstrate that TERM can be used for a multitude of applications, such as enforcing fairness between subgroups, mitigating the effect of outliers, and handling class imbalance. 
TERM is not only competitive with existing solutions tailored to these individual problems, but can also enable entirely new applications, such as simultaneously addressing outliers and promoting fairness.", + "title": "Tilted Empirical Risk Minimization", + "authors": [ + "Tian Li", + "Ahmad Beirami", + "Maziar Sanjabi", + "Virginia Smith" + ], + "emails": [ + "~Maziar_Sanjabi1", + "~Tian_Li1", + "~Virginia_Smith1", + "~Ahmad_Beirami1" + ], + "rank": 394 + }, + { + "url": "https://openreview.net/forum?id=RmB-88r9dL", + "decision": "Accept (Oral)", + "ratings": [ + 5, + 6, + 9 + ], + "rating": "6.62", + "confidences": [ + 4, + 5, + 4 + ], + "abstract": "Motivated by the rising abundance of observational data with continuous treatments, we investigate the problem of estimating the average dose-response curve (ADRF). Available parametric methods are limited in their model space, and previous attempts in leveraging neural network to enhance model expressiveness relied on partitioning continuous treatment into blocks and using separate heads for each block; this however produces in practice discontinuous ADRFs. Therefore, the question of how to adapt the structure and training of neural network to estimate ADRFs remains open. This paper makes two important contributions. First, we propose a novel varying coefficient neural network (VCNet) that improves model expressiveness while preserving continuity of the estimated ADRF. 
Second, to improve finite sample performance, we generalize targeted regularization to obtain a doubly robust estimator of the whole ADRF curve.", + "title": "VCNet and Functional Targeted Regularization For Learning Causal Effects of Continuous Treatments", + "authors": [ + "Lizhen Nie", + "Mao Ye", + "qiang liu", + "Dan Nicolae" + ], + "emails": [ + "uchicago.edu", + "~qiang_liu4", + "~Mao_Ye11", + "galton.uchicago.edu" + ], + "rank": 395 + }, + { + "url": "https://openreview.net/forum?id=snaT4xewUfX", + "decision": "Reject", + "ratings": [ + 6, + 7, + 7 + ], + "rating": "6.62", + "confidences": [ + 3, + 2, + 3 + ], + "abstract": "This paper proposes a stochastic variational inference (SVI) method for computing an approximate posterior path measure of a Cox process. These processes are widely used in natural and physical sciences, engineering and operations research, and represent a non-trivial model of a wide array of phenomena. In our work, we model the stochastic intensity as the solution of a diffusion stochastic differential equation (SDE), and our objective is to infer the posterior, or smoothing, measure over the paths given Poisson process realizations. We first derive a system of stochastic partial differential equations (SPDE) for the pathwise smoothing posterior density function, a non-trivial result, since the standard solution of SPDEs typically involves an It\\^o stochastic integral, which is not defined pathwise. Next, we propose an SVI approach to approximating the solution of the system. We parametrize the class of approximate smoothing posteriors using a neural network, derive a lower bound on the evidence of the observed point process sample-path, and optimize the lower bound using stochastic gradient descent (SGD). 
We demonstrate the efficacy of our method on both synthetic and real-world problems, and demonstrate the advantage of the neural network solution over standard numerical solvers.", + "title": "Variational inference for diffusion modulated Cox processes", + "authors": [ + "Prateek Jaiswal", + "Harsha Honnappa", + "Vinayak Rao" + ], + "emails": [ + "~Vinayak_Rao1", + "~Harsha_Honnappa1", + "~Prateek_Jaiswal1" + ], + "rank": 396 + }, + { + "url": "https://openreview.net/forum?id=TtYSU29zgR", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 8, + 6, + 6 + ], + "rating": "6.62", + "confidences": [ + 4, + 5, + 4, + 3 + ], + "abstract": "Imitation Learning (IL) methods seek to match the behavior of an agent with that of an expert. In the present work, we propose a new IL method based on a conceptually simple algorithm: Primal Wasserstein Imitation Learning (PWIL), which ties to the primal form of the Wasserstein distance between the expert and the agent state-action distributions. We present a reward function which is derived offline, as opposed to recent adversarial IL algorithms that learn a reward function through interactions with the environment, and which requires little fine-tuning. We show that we can recover expert behavior on a variety of continuous control tasks of the MuJoCo domain in a sample efficient manner in terms of agent interactions and of expert interactions with the environment. 
Finally, we show that the behavior of the agent we train matches the behavior of the expert with the Wasserstein distance, rather than the commonly used proxy of performance.", + "title": "Primal Wasserstein Imitation Learning", + "authors": [ + "Robert Dadashi", + "Leonard Hussenot", + "Matthieu Geist", + "Olivier Pietquin" + ], + "emails": [ + "~Leonard_Hussenot1", + "~Matthieu_Geist1", + "~Robert_Dadashi2", + "~Olivier_Pietquin1" + ], + "rank": 397 + }, + { + "url": "https://openreview.net/forum?id=7aogOj_VYO0", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 9, + 5 + ], + "rating": "6.62", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "The privacy leakage of the model about the training data can be bounded in the differential privacy mechanism. However, for meaningful privacy parameters, a differentially private model degrades the utility drastically when the model comprises a large number of trainable parameters. In this paper, we propose an algorithm \\emph{Gradient Embedding Perturbation (GEP)} towards training differentially private deep models with decent accuracy. Specifically, in each gradient descent step, GEP first projects individual private gradient into a non-sensitive anchor subspace, producing a low-dimensional gradient embedding and a small-norm residual gradient. Then, GEP perturbs the low-dimensional embedding and the residual gradient separately according to the privacy budget. Such a decomposition permits a small perturbation variance, which greatly helps to break the dimensional barrier of private learning. With GEP, we achieve decent accuracy with low computational cost and modest privacy guarantee for deep models. 
Especially, with privacy bound \u03f5=8, we achieve 74.9% test accuracy on CIFAR10 and 95.1% test accuracy on SVHN, significantly improving over existing results.", + "title": "Do not Let Privacy Overbill Utility: Gradient Embedding Perturbation for Private Learning", + "authors": [ + "Da Yu", + "Huishuai Zhang", + "Wei Chen", + "Tie-Yan Liu" + ], + "emails": [ + "~Wei_Chen1", + "~Tie-Yan_Liu1", + "~Da_Yu1", + "~Huishuai_Zhang3" + ], + "rank": 398 + }, + { + "url": "https://openreview.net/forum?id=JbuYF437WB6", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7 + ], + "rating": "6.60", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "Graph-structured data ubiquitously appears in science and engineering. Graph neural networks (GNNs) are designed to exploit the relational inductive bias exhibited in graphs; they have been shown to outperform other forms of neural networks in scenarios where structure information supplements node features. The most common GNN architecture aggregates information from neighborhoods based on message passing. Its generality has made it broadly applicable. In this paper, we focus on a special, yet widely used, type of graphs---DAGs---and inject a stronger inductive bias---partial ordering---into the neural network design. We propose the directed acyclic graph neural network, DAGNN, an architecture that processes information according to the flow defined by the partial order. DAGNN can be considered a framework that entails earlier works as special cases (e.g., models for trees and models updating node representations recurrently), but we identify several crucial components that prior architectures lack. 
We perform comprehensive experiments, including ablation studies, on representative DAG datasets (i.e., source code, neural architectures, and probabilistic graphical models) and demonstrate the superiority of DAGNN over simpler DAG architectures as well as general graph architectures.", + "title": "Directed Acyclic Graph Neural Networks", + "authors": [ + "Veronika Thost", + "Jie Chen" + ], + "emails": [ + "~Veronika_Thost1", + "~Jie_Chen1" + ], + "rank": 399 + }, + { + "url": "https://openreview.net/forum?id=Lc28QAB4ypz", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 5 + ], + "rating": "6.60", + "confidences": [ + 4, + 3, + 5, + 3 + ], + "abstract": "Decomposing knowledge into interchangeable pieces promises a generalization advantage when there are changes in distribution. A learning agent interacting with its environment is likely to be faced with situations requiring novel combinations of existing pieces of knowledge. We hypothesize that such a decomposition of knowledge is particularly relevant for being able to generalize in a systematic way to out-of-distribution changes. To study these ideas, we propose a particular training framework in which we assume that the pieces of knowledge an agent needs and its reward function are stationary and can be re-used across tasks. An attention mechanism dynamically selects which modules can be adapted to the current task, and the parameters of the \\textit{selected} modules are allowed to change quickly as the learner is confronted with variations in what it experiences, while the parameters of the attention mechanisms act as stable, slowly changing, meta-parameters. We focus on pieces of knowledge captured by an ensemble of modules sparsely communicating with each other via a bottleneck of attention. 
We find that meta-learning the modular aspects of the proposed system greatly helps in achieving faster adaptation in a reinforcement learning setup involving navigation in a partially observed grid world with image-level input. We also find that reversing the role of parameters and meta-parameters does not work nearly as well, suggesting a particular role for fast adaptation of the dynamically selected modules.", + "title": "Fast And Slow Learning Of Recurrent Independent Mechanisms", + "authors": [ + "Kanika Madan", + "Nan Rosemary Ke", + "Anirudh Goyal", + "Bernhard Sch\u00f6lkopf", + "Yoshua Bengio" + ], + "emails": [ + "~Kanika_Madan3", + "~Bernhard_Sch\u00f6lkopf1", + "~Yoshua_Bengio1", + "~Anirudh_Goyal1", + "~Nan_Rosemary_Ke1" + ], + "rank": 400 + }, + { + "url": "https://openreview.net/forum?id=ee6W5UgQLa", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 8, + 6 + ], + "rating": "6.60", + "confidences": [ + 3, + 3, + 3, + 1 + ], + "abstract": "When answering complex questions, people can seamlessly combine information from visual, textual and tabular sources. \nWhile interest in models that reason over multiple pieces of evidence has surged in recent years, there has been relatively little work on question answering models that reason across multiple modalities.\nIn this paper, we present MultiModalQA (MMQA): a challenging question answering dataset that requires joint reasoning over text, tables and images. \nWe create MMQA using a new framework for generating complex multi-modal questions at scale, harvesting tables from Wikipedia, and attaching images and text paragraphs using entities that appear in each table. We then define a formal language that allows us to take questions that can be answered from a single modality, and combine them to generate cross-modal questions. 
Last, crowdsourcing workers take these automatically generated questions and rephrase them into more fluent language.\nWe create 29,918 questions through this procedure, and empirically demonstrate the necessity of a multi-modal multi-hop approach to solve our task: our multi-hop model, ImplicitDecomp, achieves an average F1 of 51.7 over cross-modal questions, substantially outperforming a strong baseline that achieves 38.2 F1, but still lags significantly behind human performance, which is at 90.1 F1.", + "title": "MultiModalQA: complex question answering over text, tables and images", + "authors": [ + "Alon Talmor", + "Ori Yoran", + "Amnon Catav", + "Dan Lahav", + "Yizhong Wang", + "Akari Asai", + "Gabriel Ilharco", + "Hannaneh Hajishirzi", + "Jonathan Berant" + ], + "emails": [ + "~Alon_Talmor1", + "~Jonathan_Berant1", + "~Gabriel_Ilharco1", + "mail.tau.ac.il", + "~Yizhong_Wang2", + "~Akari_Asai2", + "~Hannaneh_Hajishirzi1" + ], + "rank": 401 + }, + { + "url": "https://openreview.net/forum?id=YWtLZvLmud7", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7, + 6, + 7 + ], + "rating": "6.60", + "confidences": [ + 4, + 4, + 4, + 4, + 4 + ], + "abstract": "Transformer architectures have proven to learn useful representations for protein classification and generation tasks. However, these representations present challenges in interpretability. In this work, we demonstrate a set of methods for analyzing protein Transformer models through the lens of attention. We show that attention: (1) captures the folding structure of proteins, connecting amino acids that are far apart in the underlying sequence, but spatially close in the three-dimensional structure, (2) targets binding sites, a key functional component of proteins, and (3) focuses on progressively more complex biophysical properties with increasing layer depth. We find this behavior to be consistent across three Transformer architectures (BERT, ALBERT, XLNet) and two distinct protein datasets. 
We also present a three-dimensional visualization of the interaction between attention and protein structure. Code for visualization and analysis is available at https://github.com/salesforce/provis.", + "title": "BERTology Meets Biology: Interpreting Attention in Protein Language Models", + "authors": [ + "Jesse Vig", + "Ali Madani", + "Lav R. Varshney", + "Caiming Xiong", + "richard socher", + "Nazneen Rajani" + ], + "emails": [ + "~Jesse_Vig1", + "~Nazneen_Rajani1", + "berkeley.edu", + "~Lav_R._Varshney1", + "~richard_socher1", + "~Caiming_Xiong1" + ], + "rank": 402 + }, + { + "url": "https://openreview.net/forum?id=l35SB-_raSQ", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 8, + 6 + ], + "rating": "6.60", + "confidences": [ + 3, + 4, + 5, + 3 + ], + "abstract": "We consider a regression problem, where the correspondence between the input and output data is not available. Such shuffled data are commonly observed in many real world problems. Take flow cytometry as an example: the measuring instruments are unable to preserve the correspondence between the samples and the measurements. Due to the combinatorial nature of the problem, most of the existing methods are only applicable when the sample size is small, and are limited to linear regression models. To overcome such bottlenecks, we propose a new computational framework --- ROBOT --- for the shuffled regression problem, which is applicable to large data and complex models. Specifically, we propose to formulate regression without correspondence as a continuous optimization problem. Then by exploiting the interaction between the regression model and the data correspondence, we propose to develop a hypergradient approach based on differentiable programming techniques. 
Such a hypergradient approach essentially views the data correspondence as an operator of the regression model, and therefore it allows us to find a better descent direction for the model parameters by differentiating through the data correspondence. ROBOT is quite general, and can be further extended to an inexact correspondence setting, where the input and output data are not necessarily exactly aligned. Thorough numerical experiments show that ROBOT achieves better performance than existing methods in both linear and nonlinear regression tasks, including real-world applications such as flow cytometry and multi-object tracking. ", + "title": "A Hypergradient Approach to Robust Regression without Correspondence", + "authors": [ + "Yujia Xie", + "Yixiu Mao", + "Simiao Zuo", + "Hongteng Xu", + "Xiaojing Ye", + "Tuo Zhao", + "Hongyuan Zha" + ], + "emails": [ + "~Xiaojing_Ye1", + "~Hongteng_Xu1", + "~Yujia_Xie1", + "gmail.com", + "~Tuo_Zhao1", + "~Hongyuan_Zha1", + "~Simiao_Zuo1" + ], + "rank": 403 + }, + { + "url": "https://openreview.net/forum?id=VVdmjgu7pKM", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 8, + 7 + ], + "rating": "6.60", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Modeling a structured, dynamic environment like a video game requires keeping track of the objects and their states (declarative knowledge) as well as predicting how objects behave (procedural knowledge). Black-box models with a monolithic hidden state often fail to apply procedural knowledge consistently and uniformly, i.e., they lack systematicity. For example, in a video game, correct prediction of one enemy's trajectory does not ensure correct prediction of another's. We address this issue via an architecture that factorizes declarative and procedural knowledge and that imposes modularity within each form of knowledge. 
The architecture consists of active modules called object files that maintain the state of a single object and invoke passive external knowledge sources called schemata that prescribe state updates. To use a video game as an illustration, two enemies of the same type will share schemata but will have separate object files to encode their distinct state (e.g., health, position). We propose to use attention to determine which object files to update, the selection of schemata, and the propagation of information between object files. The resulting architecture is a drop-in replacement conforming to the same input-output interface as normal recurrent networks (e.g., LSTM, GRU) yet achieves substantially better generalization on environments that have multiple object tokens of the same type, including a challenging intuitive physics benchmark.\n", + "title": "Factorizing Declarative and Procedural Knowledge in Structured, Dynamical Environments", + "authors": [ + "Anirudh Goyal", + "Alex Lamb", + "Phanideep Gampa", + "Philippe Beaudoin", + "Charles Blundell", + "Sergey Levine", + "Yoshua Bengio", + "Michael Curtis Mozer" + ], + "emails": [ + "itbhu.ac.in", + "~Yoshua_Bengio1", + "~Charles_Blundell1", + "~Michael_Curtis_Mozer1", + "~Anirudh_Goyal1", + "~Sergey_Levine1", + "~Philippe_Beaudoin1", + "~Alex_Lamb1" + ], + "rank": 404 + }, + { + "url": "https://openreview.net/forum?id=ZsZM-4iMQkH", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6 + ], + "rating": "6.60", + "confidences": [ + 4, + 2, + 4 + ], + "abstract": "We study the implicit bias of gradient flow (i.e., gradient descent with infinitesimal step size) on linear neural network training. We propose a tensor formulation of neural networks that includes fully-connected, diagonal, and convolutional networks as special cases, and investigate the linear version of the formulation called linear tensor networks. 
With this formulation, we can characterize the convergence direction of the network parameters as singular vectors of a tensor defined by the network. For L-layer linear tensor networks that are orthogonally decomposable, we show that gradient flow on separable classification finds a stationary point of the \u21132/L max-margin problem in a \"transformed\" input space defined by the network. For underdetermined regression, we prove that gradient flow finds a global minimum which minimizes a norm-like function that interpolates between weighted \u21131 and \u21132 norms in the transformed input space. Our theorems subsume existing results in the literature while removing standard convergence assumptions. We also provide experiments that corroborate our analysis.", + "title": "A unifying view on implicit bias in training linear neural networks", + "authors": [ + "Chulhee Yun", + "Shankar Krishnan", + "Hossein Mobahi" + ], + "emails": [ + "~Shankar_Krishnan1", + "~Chulhee_Yun1", + "~Hossein_Mobahi2" + ], + "rank": 405 + }, + { + "url": "https://openreview.net/forum?id=OHgnfSrn2jv", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 8, + 6 + ], + "rating": "6.60", + "confidences": [ + 2, + 4, + 4 + ], + "abstract": "A novel optimization approach is proposed for application to policy gradient methods and evolution strategies for reinforcement learning (RL). The procedure uses a computationally efficient \\emph{Wasserstein natural gradient} (WNG) descent that takes advantage of the geometry induced by a Wasserstein penalty to speed optimization. This method follows the recent theme in RL of including divergence penalties in the objective to establish trust regions. Experiments on challenging tasks demonstrate improvements in both computational cost and performance over advanced baselines. 
\n", + "title": "Efficient Wasserstein Natural Gradients for Reinforcement Learning", + "authors": [ + "Ted Moskovitz", + "Michael Arbel", + "Ferenc Huszar", + "Arthur Gretton" + ], + "emails": [ + "~Ferenc_Huszar1", + "~Arthur_Gretton1", + "~Michael_Arbel1", + "~Ted_Moskovitz1" + ], + "rank": 406 + }, + { + "url": "https://openreview.net/forum?id=CHLhSw9pSw8", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 3, + 6, + 7 + ], + "rating": "6.60", + "confidences": [ + 3, + 1, + 3, + 3 + ], + "abstract": "Quantum Computing based Machine Learning mainly focuses on quantum computing hardware that is experimentally challenging to realize due to requiring quantum gates that operate at very low temperature. We demonstrate the existence of a \"quantum computing toy model\" that illustrates key aspects of quantum information processing while being experimentally accessible with room temperature optics. Pondering the question of the theoretical classification accuracy performance limit for MNIST (respectively \"Fashion-MNIST\") classifiers, subject to the constraint that a decision has to be made after detection of the very first photon that passed through an image-filter, we show that a machine learning system that is permitted to use quantum interference on the photon's state can substantially outperform any machine learning system that can not. Specifically, we prove that a \"classical\" MNIST (respectively \"Fashion-MNIST\") classifier cannot achieve an accuracy of better than 21.28% (respectively 18.28% for \"Fashion-MNIST\") if it must make a decision after seeing a single photon falling on one of the 28\u00d728 image pixels of a detector array. We further demonstrate that a classifier that is permitted to employ quantum interference by optically transforming the photon state prior to detection can achieve a classification accuracy of at least 41.27% for MNIST (respectively 36.14% for \"Fashion-MNIST\"). 
We show in detail how to train the corresponding quantum state transformation with TensorFlow and also explain how this example can serve as a teaching tool for the measurement process in quantum mechanics.\n", + "title": "Single-Photon Image Classification", + "authors": [ + "Thomas Fischbacher", + "Luciano Sbaiz" + ], + "emails": [ + "~Thomas_Fischbacher1", + "~Luciano_Sbaiz1" + ], + "rank": 407 + }, + { + "url": "https://openreview.net/forum?id=dyaIRud1zXg", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 6, + 7 + ], + "rating": "6.60", + "confidences": [ + 4, + 4, + 2 + ], + "abstract": "In this work, we propose information laundering, a novel framework for enhancing model privacy. Unlike data privacy that concerns the protection of raw data information, model privacy aims to protect an already-learned model that is to be deployed for public use. The private model can be obtained from general learning methods, and its deployment means that it will return a deterministic or random response for a given input query. An information-laundered model consists of probabilistic components that deliberately maneuver the intended input and output for queries of the model, so the model's adversarial acquisition is less likely. 
Under the proposed framework, we develop an information-theoretic principle to quantify the fundamental tradeoffs between model utility and privacy leakage and derive the optimal design.", + "title": "Information Laundering for Model Privacy", + "authors": [ + "Xinran Wang", + "Yu Xiang", + "Jun Gao", + "Jie Ding" + ], + "emails": [ + "utah.edu", + "umn.edu", + "gmail.com", + "~Jie_Ding2" + ], + "rank": 408 + }, + { + "url": "https://openreview.net/forum?id=jM76BCb6F9m", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 8, + 4 + ], + "rating": "6.59", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Mel-filterbanks are fixed, engineered audio features which emulate human perception and have been used through the history of audio understanding up to today. However, their undeniable qualities are counterbalanced by the fundamental limitations of handmade representations. In this work we show that we can train a single learnable frontend that outperforms mel-filterbanks on a wide range of audio signals, including speech, music, audio events and animal sounds, providing a general-purpose learned frontend for audio classification. To do so, we introduce a new principled, lightweight, fully learnable architecture that can be used as a drop-in replacement of mel-filterbanks. Our system learns all operations of audio features extraction, from filtering to pooling, compression and normalization, and can be integrated into any neural network at a negligible parameter cost. We perform multi-task training on eight diverse audio classification tasks, and show consistent improvements of our model over mel-filterbanks and previous learnable alternatives. 
Moreover, our system outperforms the current state-of-the-art learnable frontend on Audioset, with orders of magnitude fewer parameters.", + "title": "LEAF: A Learnable Frontend for Audio Classification", + "authors": [ + "Neil Zeghidour", + "Olivier Teboul", + "F\u00e9lix de Chaumont Quitry", + "Marco Tagliasacchi" + ], + "emails": [ + "~Neil_Zeghidour1", + "~F\u00e9lix_de_Chaumont_Quitry1", + "~Olivier_Teboul2", + "~Marco_Tagliasacchi3" + ], + "rank": 409 + }, + { + "url": "https://openreview.net/forum?id=qbH974jKUVy", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 6, + 8 + ], + "rating": "6.59", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "Combinatorial generalisation \u2014 the ability to understand and produce novel combinations of familiar elements \u2014 is a core capacity of human intelligence that current AI systems struggle with. Recently, it has been suggested that learning disentangled representations may help address this problem. It is claimed that such representations should be able to capture the compositional structure of the world which can then be combined to support combinatorial generalisation. In this study, we systematically tested how the degree of disentanglement affects various forms of generalisation, including two forms of combinatorial generalisation that varied in difficulty. We trained three classes of variational autoencoders (VAEs) on two datasets on an unsupervised task by excluding combinations of generative factors during training. At test time we ask the models to reconstruct the missing combinations in order to measure generalisation performance. Irrespective of the degree of disentanglement, we found that the models supported only weak combinatorial generalisation. We obtained the same outcome when we directly input perfectly disentangled representations as the latents, and when we tested a model on a more complex task that explicitly required independent generative factors to be controlled. 
While learning disentangled representations does improve interpretability and sample efficiency in some downstream tasks, our results suggest that they are not sufficient for supporting more difficult forms of generalisation.", + "title": "The role of Disentanglement in Generalisation", + "authors": [ + "Milton Llera Montero", + "Casimir JH Ludwig", + "Rui Ponte Costa", + "Gaurav Malhotra", + "Jeffrey Bowers" + ], + "emails": [ + "~Gaurav_Malhotra1", + "~Jeffrey_Bowers1", + "~Milton_Llera_Montero1", + "~Rui_Ponte_Costa3", + "bristol.ac.uk" + ], + "rank": 410 + }, + { + "url": "https://openreview.net/forum?id=Ig-VyQc-MLK", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 4, + 9 + ], + "rating": "6.59", + "confidences": [ + 5, + 3, + 4, + 5 + ], + "abstract": "Recent work has explored the possibility of pruning neural networks at initialization. We assess proposals for doing so: SNIP (Lee et al., 2019), GraSP (Wang et al., 2020), SynFlow (Tanaka et al., 2020), and magnitude pruning. Although these methods surpass the trivial baseline of random pruning, they remain below the accuracy of magnitude pruning after training, and we endeavor to understand why. We show that, unlike pruning after training, randomly shuffling the weights these methods prune within each layer or sampling new initial values preserves or improves accuracy. As such, the per-weight pruning decisions made by these methods can be replaced by a per-layer choice of the fraction of weights to prune. 
This property suggests broader challenges with the underlying pruning heuristics, the desire to prune at initialization, or both.", + "title": "Pruning Neural Networks at Initialization: Why Are We Missing the Mark?", + "authors": [ + "Jonathan Frankle", + "Gintare Karolina Dziugaite", + "Daniel Roy", + "Michael Carbin" + ], + "emails": [ + "~Jonathan_Frankle1", + "~Daniel_Roy1", + "~Gintare_Karolina_Dziugaite1", + "~Michael_Carbin1" + ], + "rank": 411 + }, + { + "url": "https://openreview.net/forum?id=vyY0jnWG-tK", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 7, + 7 + ], + "rating": "6.58", + "confidences": [ + 3, + 3, + 5, + 4, + 4 + ], + "abstract": "Given (small amounts of) time-series' data from a high-dimensional, fine-grained, multiscale dynamical system, we propose a generative framework for learning an effective, lower-dimensional, coarse-grained dynamical model that is predictive of the fine-grained system's long-term evolution but also of its behavior under different initial conditions.\nWe target fine-grained models as they arise in physical applications (e.g. molecular dynamics, agent-based models), the dynamics of which are strongly non-stationary but their transition to equilibrium is governed by unknown slow processes which are largely inaccessible by brute-force simulations.\nApproaches based on domain knowledge heavily rely on physical insight in identifying temporally slow features and fail to enforce the long-term stability of the learned dynamics. On the other hand, purely statistical frameworks lack interpretability and rely on large amounts of expensive simulation data (long and multiple trajectories) as they cannot infuse domain knowledge. \nThe generative framework proposed achieves the aforementioned desiderata by employing a flexible prior on the complex plane for the latent, slow processes, and an intermediate layer of physics-motivated latent variables that reduces reliance on data and imbues inductive bias. 
In contrast to existing schemes, it does not require the a priori definition of projection operators from the fine-grained description and addresses simultaneously the tasks of dimensionality reduction and model estimation.\nWe demonstrate its efficacy and accuracy in multiscale physical systems of particle dynamics where probabilistic, long-term predictions of phenomena not contained in the training data are produced.", + "title": "Physics-aware, probabilistic model order reduction with guaranteed stability", + "authors": [ + "Sebastian Kaltenbach", + "Phaedon Stelios Koutsourelakis" + ], + "emails": [ + "~Sebastian_Kaltenbach1", + "~Phaedon_Stelios_Koutsourelakis1" + ], + "rank": 412 + }, + { + "url": "https://openreview.net/forum?id=RovX-uQ1Hua", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 7, + 7, + 7 + ], + "rating": "6.58", + "confidences": [ + 3, + 4, + 4, + 4, + 4 + ], + "abstract": "Current approaches to text generation largely rely on autoregressive models and maximum likelihood estimation. This paradigm leads to (i) diverse but low-quality samples due to mismatched learning objective and evaluation metric (likelihood vs. quality) and (ii) exposure bias due to mismatched history distributions (gold vs. model-generated). To alleviate these problems, we frame text generation as an offline reinforcement learning (RL) problem with expert demonstrations (i.e., the reference), where the goal is to maximize quality given model-generated histories. We propose GOLD (generation by off-policy learning from demonstrations): an easy-to-optimize algorithm that learns from the demonstrations by importance weighting. Intuitively, GOLD upweights confident tokens and downweights unconfident ones in the reference during training, avoiding optimization issues faced by prior RL approaches that rely on online data collection. 
According to both automatic and human evaluation, models trained by GOLD outperform those trained by MLE and policy gradient on summarization, question generation, and machine translation. Further, our models are less sensitive to decoding algorithms and alleviate exposure bias.", + "title": "Text Generation by Learning from Demonstrations", + "authors": [ + "Richard Yuanzhe Pang", + "He He" + ], + "emails": [ + "~Richard_Yuanzhe_Pang1", + "~He_He2" + ], + "rank": 413 + }, + { + "url": "https://openreview.net/forum?id=GTGb3M_KcUl", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7, + 7 + ], + "rating": "6.58", + "confidences": [ + 2, + 5, + 1, + 4 + ], + "abstract": "Recently, the DL compiler, together with Learning to Compile has proven to be a powerful technique for optimizing deep learning models. However, existing methods focus on accelerating the convergence speed of the individual tensor operator rather than the convergence speed of the entire model, which results in long optimization time to obtain a desired latency.\n\nIn this paper, we present a new method called DynaTune, which provides significantly faster convergence speed to optimize a DNN model. In particular, we consider a Multi-Armed Bandit (MAB) model for the tensor program optimization problem. We use UCB to handle the decision-making of time-slot-based optimization, and we devise a Bayesian belief model that allows predicting the potential performance gain of each operator with uncertainty quantification, which guides the optimization process. We evaluate and compare DynaTune with the state-of-the-art DL compiler. The experiment results show that DynaTune is 1.2--2.4 times faster to achieve the same optimization quality for a range of models across different hardware architectures. 
", + "title": "DynaTune: Dynamic Tensor Program Optimization in Deep Neural Network Compilation", + "authors": [ + "Minjia Zhang", + "Menghao Li", + "Chi Wang", + "Mingqin Li" + ], + "emails": [ + "~Chi_Wang3", + "microsoft.com", + "~Minjia_Zhang1" + ], + "rank": 414 + }, + { + "url": "https://openreview.net/forum?id=_QnwcbR-GG", + "decision": "Reject", + "ratings": [ + 7, + 4, + 8 + ], + "rating": "6.58", + "confidences": [ + 5, + 3, + 4 + ], + "abstract": "A neural implicit outputs a number indicating whether the given query point in space is inside, outside, or on a surface. Many prior works have focused on _latent-encoded_ neural implicits, where a latent vector encoding of a specific shape is also fed as input. While affording latent-space interpolation, this comes at the cost of reconstruction accuracy for any _single_ shape. Training a specific network for each 3D shape, a _weight-encoded_ neural implicit may forgo the latent vector and focus reconstruction accuracy on the details of a single shape. While previously considered as an intermediary representation for 3D scanning tasks or as a toy-problem leading up to latent-encoding tasks, weight-encoded neural implicits have not yet been taken seriously as a 3D shape representation. In this paper, we establish that weight-encoded neural implicits meet the criteria of a first-class 3D shape representation. We introduce a suite of technical contributions to improve reconstruction accuracy, convergence, and robustness when learning the signed distance field induced by a polygonal mesh --- the _de facto_ standard representation. Viewed as a lossy compression, our conversion outperforms standard techniques from geometry processing. 
Compared to previous latent- and weight-encoded neural implicits we demonstrate superior robustness, scalability, and performance.", + "title": "On the Effectiveness of Weight-Encoded Neural Implicit 3D Shapes ", + "authors": [ + "Thomas Ryan Davies", + "Derek Nowrouzezahrai", + "Alec Jacobson" + ], + "emails": [ + "~Thomas_Ryan_Davies1", + "~Alec_Jacobson1", + "~Derek_Nowrouzezahrai1" + ], + "rank": 415 + }, + { + "url": "https://openreview.net/forum?id=hkMoYYEkBoI", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 8, + 8 + ], + "rating": "6.57", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Convolutional neural networks (CNN) exhibit unmatched performance in a multitude of computer vision tasks. However, the advantage of using convolutional networks over fully-connected networks is not understood from a theoretical perspective. In this work, we show how convolutional networks can leverage locality in the data, and thus achieve a computational advantage over fully-connected networks. Specifically, we show a class of problems that can be efficiently solved using convolutional networks trained with gradient-descent, but at the same time is hard to learn using a polynomial-size fully-connected network.", + "title": "Computational Separation Between Convolutional and Fully-Connected Networks", + "authors": [ + "eran malach", + "Shai Shalev-Shwartz" + ], + "emails": [ + "~Shai_Shalev-Shwartz1", + "~eran_malach1" + ], + "rank": 416 + }, + { + "url": "https://openreview.net/forum?id=aUX5Plaq7Oy", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 7 + ], + "rating": "6.57", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "The behavior of many dynamical systems follow complex, yet still unknown partial differential equations (PDEs). 
While several machine learning methods have been proposed to learn PDEs directly from data, previous methods are limited to discrete-time approximations or make the limiting assumption of the observations arriving at regular grids. We propose a general continuous-time differential model for dynamical systems whose governing equations are parameterized by message passing graph neural networks. The model admits arbitrary space and time discretizations, which removes constraints on the locations of observation points and time intervals between the observations. The model is trained with continuous-time adjoint method enabling efficient neural PDE inference. We demonstrate the model's ability to work with unstructured grids, arbitrary time steps, and noisy observations. We compare our method with existing approaches on several well-known physical systems that involve first and higher-order PDEs with state-of-the-art predictive performance.", + "title": "Learning continuous-time PDEs from sparse data with graph neural networks", + "authors": [ + "Valerii Iakovlev", + "Markus Heinonen", + "Harri L\u00e4hdesm\u00e4ki" + ], + "emails": [ + "~Valerii_Iakovlev1", + "~Harri_L\u00e4hdesm\u00e4ki1", + "~Markus_Heinonen1" + ], + "rank": 417 + }, + { + "url": "https://openreview.net/forum?id=tH6_VWZjoq", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 6 + ], + "rating": "6.57", + "confidences": [ + 3, + 5, + 3, + 3 + ], + "abstract": "We propose greedy and local search algorithms for rank-constrained convex optimization, namely solving minrank(A)\u2264r\u2217R(A) given a convex function R:Rm\u00d7n\u2192R and a parameter r\u2217. These algorithms consist of repeating two steps: (a) adding a new rank-1 matrix to A and (b) enforcing the rank constraint on A. We refine and improve the theoretical analysis of Shalev-Shwartz et al. 
(2011), and show that if the rank-restricted condition number of R is \u03ba, a solution A with rank O(r\u2217\u22c5min{\u03balog\u2061R(0)\u2212R(A\u2217)\u03f5,\u03ba2}) and R(A)\u2264R(A\u2217)+\u03f5 can be recovered, where A\u2217 is the optimal solution. This significantly generalizes associated results on sparse convex optimization, as well as rank-constrained convex optimization for smooth functions. We then introduce new practical variants of these algorithms that have superior runtime and recover better solutions in practice. We demonstrate the versatility of these methods on a wide range of applications involving matrix completion and robust principal component analysis.\n", + "title": "Local Search Algorithms for Rank-Constrained Convex Optimization", + "authors": [ + "Kyriakos Axiotis", + "Maxim Sviridenko" + ], + "emails": [ + "verizonmedia.com", + "~Kyriakos_Axiotis1" + ], + "rank": 418 + }, + { + "url": "https://openreview.net/forum?id=4qR3coiNaIv", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 7 + ], + "rating": "6.57", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Bayesian inference over the reward presents an ideal solution to the ill-posed nature of the inverse reinforcement learning problem. Unfortunately current methods generally do not scale well beyond the small tabular setting due to the need for an inner-loop MDP solver, and even non-Bayesian methods that do themselves scale often require extensive interaction with the environment to perform well, being inappropriate for high stakes or costly applications such as healthcare. In this paper we introduce our method, Approximate Variational Reward Imitation Learning (AVRIL), that addresses both of these issues by jointly learning an approximate posterior distribution over the reward that scales to arbitrarily complicated state spaces alongside an appropriate policy in a completely offline manner through a variational approach to said latent reward. 
Applying our method to real medical data alongside classic control simulations, we demonstrate Bayesian reward inference in environments beyond the scope of current methods, as well as task performance competitive with focused offline imitation learning algorithms.", + "title": "Scalable Bayesian Inverse Reinforcement Learning", + "authors": [ + "Alex James Chan", + "Mihaela van der Schaar" + ], + "emails": [ + "~Mihaela_van_der_Schaar2", + "~Alex_James_Chan1" + ], + "rank": 419 + }, + { + "url": "https://openreview.net/forum?id=Yz-XtK5RBxB", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 7 + ], + "rating": "6.57", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "We propose the deep repulsive clustering (DRC) algorithm of ordered data for effective order learning. First, we develop the order-identity decomposition (ORID) network to divide the information of an object instance into an order-related feature and an identity feature. Then, we group object instances into clusters according to their identity features using a repulsive term. Moreover, we estimate the rank of a test instance, by comparing it with references within the same cluster. 
Experimental results on facial age estimation, aesthetic score regression, and historical color image classification show that the proposed algorithm can cluster ordered data effectively and also yield excellent rank estimation performance.", + "title": "Deep Repulsive Clustering of Ordered Data Based on Order-Identity Decomposition", + "authors": [ + "Seon-Ho Lee", + "Chang-Su Kim" + ], + "emails": [ + "~Chang-Su_Kim4", + "~Seon-Ho_Lee1" + ], + "rank": 420 + }, + { + "url": "https://openreview.net/forum?id=C0qJUx5dxFb", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7, + 6 + ], + "rating": "6.57", + "confidences": [ + 4, + 2, + 4, + 4 + ], + "abstract": "The largely successful method of training neural networks is to learn their weights using some variant of stochastic gradient descent (SGD). Here, we show that the solutions found by SGD can be further improved by ensembling a subset of the weights in late stages of learning. At the end of learning, we obtain back a single model by taking a spatial average in weight space. To avoid incurring increased computational costs, we investigate a family of low-dimensional late-phase weight models which interact multiplicatively with the remaining parameters. Our results show that augmenting standard models with late-phase weights improves generalization in established benchmarks such as CIFAR-10/100, ImageNet and enwik8. 
These findings are complemented with a theoretical analysis of a noisy quadratic problem which provides a simplified picture of the late phases of neural network learning.", + "title": "Neural networks with late-phase weights", + "authors": [ + "Johannes Von Oswald", + "Seijin Kobayashi", + "Joao Sacramento", + "Alexander Meulemans", + "Christian Henning", + "Benjamin F Grewe" + ], + "emails": [ + "ethz.ch", + "~Joao_Sacramento1", + "~Johannes_Von_Oswald1", + "~Alexander_Meulemans1", + "~Benjamin_F_Grewe1", + "~Christian_Henning1" + ], + "rank": 421 + }, + { + "url": "https://openreview.net/forum?id=wXgk_iCiYGo", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 7 + ], + "rating": "6.57", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "Stochastic Gradient Descent (SGD) and its variants are mainstream methods for training deep networks in practice. SGD is known to find a flat minimum that often generalizes well. However, it is mathematically unclear how deep learning can select a flat minimum among so many minima. To answer the question quantitatively, we develop a density diffusion theory to reveal how minima selection quantitatively depends on the minima sharpness and the hyperparameters. To the best of our knowledge, we are the first to theoretically and empirically prove that, benefited from the Hessian-dependent covariance of stochastic gradient noise, SGD favors flat minima exponentially more than sharp minima, while Gradient Descent (GD) with injected white noise favors flat minima only polynomially more than sharp minima. We also reveal that either a small learning rate or large-batch training requires exponentially many iterations to escape from minima in terms of the ratio of the batch size and learning rate. 
Thus, large-batch training cannot search flat minima efficiently in a realistic computational time.", + "title": "A Diffusion Theory For Deep Learning Dynamics: Stochastic Gradient Descent Exponentially Favors Flat Minima", + "authors": [ + "Zeke Xie", + "Issei Sato", + "Masashi Sugiyama" + ], + "emails": [ + "~Zeke_Xie1", + "~Issei_Sato1", + "~Masashi_Sugiyama1" + ], + "rank": 422 + }, + { + "url": "https://openreview.net/forum?id=m4UCf24r0Y", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 8, + 6 + ], + "rating": "6.57", + "confidences": [ + 4, + 2, + 4, + 4 + ], + "abstract": "A popular approach to model compression is to train an inexpensive student model to mimic the class probabilities of a highly accurate but cumbersome teacher model. Surprisingly, this two-step knowledge distillation process often leads to higher accuracy than training the student directly on labeled data. To explain and enhance this phenomenon, we cast knowledge distillation as a semiparametric inference problem with the optimal student model as the target, the unknown Bayes class probabilities as nuisance, and the teacher probabilities as a plug-in nuisance estimate. By adapting modern semiparametric tools, we derive new guarantees for the prediction error of standard distillation and develop two enhancements\u2014cross-fitting and loss correction\u2014to mitigate the impact of teacher overfitting and underfitting on student performance. 
We validate our findings empirically on both tabular and image data and observe consistent improvements from our knowledge distillation enhancements.", + "title": "Knowledge Distillation as Semiparametric Inference", + "authors": [ + "Tri Dao", + "Govinda M Kamath", + "Vasilis Syrgkanis", + "Lester Mackey" + ], + "emails": [ + "~Lester_Mackey1", + "~Vasilis_Syrgkanis1", + "microsoft.com", + "~Tri_Dao1" + ], + "rank": 423 + }, + { + "url": "https://openreview.net/forum?id=2K5WDVL2KI", + "decision": "Reject", + "ratings": [ + 8, + 6, + 6, + 6 + ], + "rating": "6.57", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "We introduce Information Condensing Active Learning (ICAL), a batch mode model agnostic Active Learning (AL) method targeted at Deep Bayesian Active Learning that focuses on acquiring labels for points which have as much information as possible about the still unacquired points. ICAL uses the Hilbert Schmidt Independence Criterion (HSIC) to measure the strength of the dependency between a candidate batch of points and the unlabeled set. We develop key optimizations that allow us to scale our method to large unlabeled sets. We show significant improvements in terms of model accuracy and negative log likelihood (NLL) on several image datasets compared to state of the art batch mode AL methods for deep learning.", + "title": "Information Condensing Active Learning", + "authors": [ + "Siddhartha Jain", + "Ge Liu", + "David Gifford" + ], + "emails": [ + "~Siddhartha_Jain1", + "~Ge_Liu2", + "~David_Gifford1" + ], + "rank": 424 + }, + { + "url": "https://openreview.net/forum?id=XI-OJ5yyse", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 5 + ], + "rating": "6.57", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Graph-structured data are ubiquitous. However, graphs encode diverse types of information and thus play different roles in data representation. 
In this paper, we distinguish the \\textit{representational} and the \\textit{correlational} roles played by the graphs in node-level prediction tasks, and we investigate how Graph Neural Network (GNN) models can effectively leverage both types of information. Conceptually, the representational information provides guidance for the model to construct better node features; while the correlational information indicates the correlation between node outcomes conditional on node features. Through a simulation study, we find that many popular GNN models are incapable of effectively utilizing the correlational information. By leveraging the idea of the copula, a principled way to describe the dependence among multivariate random variables, we offer a general solution. The proposed Copula Graph Neural Network (CopulaGNN) can take a wide range of GNN models as base models and utilize both representational and correlational information stored in the graphs. Experimental results on two types of regression tasks verify the effectiveness of the proposed method.", + "title": "CopulaGNN: Towards Integrating Representational and Correlational Roles of Graphs in Graph Neural Networks", + "authors": [ + "Jiaqi Ma", + "Bo Chang", + "Xuefei Zhang", + "Qiaozhu Mei" + ], + "emails": [ + "~Jiaqi_Ma1", + "~Bo_Chang1", + "~Xuefei_Zhang1", + "~Qiaozhu_Mei1" + ], + "rank": 425 + }, + { + "url": "https://openreview.net/forum?id=a3wKPZpGtCF", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 7, + 7 + ], + "rating": "6.57", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "It is of primary interest for ML to understand how agents learn and interact dynamically in competitive environments and games (e.g. GANs). But this has been a difficult task, as irregular behaviors are commonly observed in such systems. 
This can be explained theoretically, for instance, by the works of Cheung and Piliouras (COLT 2019; NeurIPS 2020), which showed that in two-person zero-sum games, if agents employ one of the most well-known learning algorithms, Multiplicative Weights Update (MWU), then Lyapunov chaos occurs everywhere in the payoff space. In this paper, we study how persistent chaos can occur in the more general normal game settings, where the agents might have the motivation to coordinate (which is not true for zero-sum games) and the number of agents can be arbitrary.\n\nWe characterize bimatrix games where MWU, its optimistic variant (OMWU) or Follow-the-Regularized-Leader (FTRL) algorithms are Lyapunov chaotic almost everywhere in the payoff space. Technically, our characterization is derived by extending the volume-expansion argument of Cheung and Piliouras via the canonical game decomposition into zero-sum and coordination components. Interestingly, the two components induce opposite volume-changing behaviors, so the overall behavior can be analyzed by comparing the strengths of the components against each other. The comparison is done via our new notion of \"matrix domination\" or via a linear program. For multi-player games, we present a local equivalence of volume change between general games and graphical games, which is used to perform volume and chaos analyses of MWU and OMWU in potential games.", + "title": "Chaos of Learning Beyond Zero-sum and Coordination via Game Decompositions", + "authors": [ + "Yun Kuen Cheung", + "Yixin Tao" + ], + "emails": [ + "nyu.edu", + "~Yun_Kuen_Cheung1" + ], + "rank": 426 + }, + { + "url": "https://openreview.net/forum?id=Qm8UNVCFdh", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 6, + 9, + 4 + ], + "rating": "6.56", + "confidences": [ + 4, + 5, + 4, + 5 + ], + "abstract": "Learning effective representations of visual data that generalize to a variety of downstream tasks has been a long quest for computer vision. 
Most representation learning approaches rely solely on visual data such as images or videos. In this paper, we explore a novel approach, where we use human interaction and attention cues to investigate whether we can learn better representations compared to visual-only representations. For this study, we collect a dataset of human interactions capturing body part movements and gaze in their daily lives. Our experiments show that our ``\"muscly-supervised\" representation that encodes interaction and attention cues outperforms a visual-only state-of-the-art method MoCo (He et al.,2020), on a variety of target tasks: scene classification (semantic), action recognition (temporal), depth estimation (geometric), dynamics prediction (physics) and walkable surface estimation (affordance). Our code and dataset are available at: https://github.com/ehsanik/muscleTorch.", + "title": "What Can You Learn From Your Muscles? Learning Visual Representation from Human Interactions", + "authors": [ + "Kiana Ehsani", + "Daniel Gordon", + "Thomas Hai Dang Nguyen", + "Roozbeh Mottaghi", + "Ali Farhadi" + ], + "emails": [ + "~Roozbeh_Mottaghi1", + "~Kiana_Ehsani1", + "~Thomas_Hai_Dang_Nguyen1", + "~Daniel_Gordon1", + "~Ali_Farhadi3" + ], + "rank": 427 + }, + { + "url": "https://openreview.net/forum?id=0N8jUH4JMv6", + "decision": "Accept (Spotlight)", + "ratings": [ + 6, + 7, + 7 + ], + "rating": "6.56", + "confidences": [ + 4, + 3, + 2 + ], + "abstract": "We study training of Convolutional Neural Networks (CNNs) with ReLU activations and introduce exact convex optimization formulations with a polynomial complexity with respect to the number of data samples, the number of neurons, and data dimension. More specifically, we develop a convex analytic framework utilizing semi-infinite duality to obtain equivalent convex optimization problems for several two- and three-layer CNN architectures. 
We first prove that two-layer CNNs can be globally optimized via an \u21132 norm regularized convex program. We then show that multi-layer circular CNN training problems with a single ReLU layer are equivalent to an \u21131 regularized convex program that encourages sparsity in the spectral domain. We also extend these results to three-layer CNNs with two ReLU layers. Furthermore, we present extensions of our approach to different pooling methods, which elucidates the implicit architectural bias as convex regularizers.", + "title": "Implicit Convex Regularizers of CNN Architectures: Convex Optimization of Two- and Three-Layer Networks in Polynomial Time", + "authors": [ + "Tolga Ergen", + "Mert Pilanci" + ], + "emails": [ + "~Mert_Pilanci3", + "~Tolga_Ergen1" + ], + "rank": 428 + }, + { + "url": "https://openreview.net/forum?id=mCLVeEpplNE", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 6, + 7, + 6, + 6 + ], + "rating": "6.56", + "confidences": [ + 3, + 2, + 4, + 5, + 4 + ], + "abstract": "Machine learning applications such as finance and medicine demand accurate and justifiable predictions, barring most deep learning methods from use. In response, previous work combines decision trees with deep learning, yielding models that (1) sacrifice interpretability for accuracy or (2) sacrifice accuracy for interpretability. We forgo this dilemma by jointly improving accuracy and interpretability using Neural-Backed Decision Trees (NBDTs). NBDTs replace a neural network's final linear layer with a differentiable sequence of decisions and a surrogate loss. This forces the model to learn high-level concepts and lessens reliance on highly-uncertain decisions, yielding (1) accuracy: NBDTs match or outperform modern neural networks on CIFAR, ImageNet and better generalize to unseen classes by up to 16%. Furthermore, our surrogate loss improves the original model's accuracy by up to 2%. 
NBDTs also afford (2) interpretability: improving human trustby clearly identifying model mistakes and assisting in dataset debugging. Code and pretrained NBDTs are at https://github.com/alvinwan/neural-backed-decision-trees.", + "title": "NBDT: Neural-Backed Decision Tree", + "authors": [ + "Alvin Wan", + "Lisa Dunlap", + "Daniel Ho", + "Jihan Yin", + "Scott Lee", + "Suzanne Petryk", + "Sarah Adel Bargal", + "Joseph E. Gonzalez" + ], + "emails": [ + "~Lisa_Dunlap1", + "~Suzanne_Petryk1", + "~Sarah_Adel_Bargal1", + "~Daniel_Ho2", + "~Joseph_E._Gonzalez1", + "~Alvin_Wan1", + "~Jihan_Yin1", + "~Scott_Lee2" + ], + "rank": 429 + }, + { + "url": "https://openreview.net/forum?id=QO9-y8also-", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 8, + 5, + 6 + ], + "rating": "6.56", + "confidences": [ + 3, + 5, + 4, + 4 + ], + "abstract": "Feature visualizations such as synthetic maximally activating images are a widely used explanation method to better understand the information processing of convolutional neural networks (CNNs). At the same time, there are concerns that these visualizations might not accurately represent CNNs' inner workings. Here, we measure how much extremely activating images help humans to predict CNN activations.\nUsing a well-controlled psychophysical paradigm, we compare the informativeness of synthetic images by Olah et al. (2017) with a simple baseline visualization, namely exemplary natural images that also strongly activate a specific feature map. Given either synthetic or natural reference images, human participants choose which of two query images leads to strong positive activation. The experiment is designed to maximize participants' performance, and is the first to probe intermediate instead of final layer representations. We find that synthetic images indeed provide helpful information about feature map activations (82\u00b14% accuracy; chance would be 50%). 
However, natural images --- originally intended to be a baseline --- outperform these synthetic images by a wide margin (92\u00b12%). Additionally, participants are faster and more confident for natural images, whereas subjective impressions about the interpretability of the feature visualizations by Olah et al. (2017) are mixed. The higher informativeness of natural images holds across most layers, for both expert and lay participants as well as for hand- and randomly-picked feature visualizations. Even if only a single reference image is given, synthetic images provide less information than natural images (65\u00b15% vs. 73\u00b14%). In summary, synthetic images from a popular feature visualization method are significantly less informative for assessing CNN activations than natural images. We argue that visualization methods should improve over this simple baseline.", + "title": "Exemplary Natural Images Explain CNN Activations Better than State-of-the-Art Feature Visualization", + "authors": [ + "Judy Borowski", + "Roland Simon Zimmermann", + "Judith Schepers", + "Robert Geirhos", + "Thomas S. A. Wallis", + "Matthias Bethge", + "Wieland Brendel" + ], + "emails": [ + "~Wieland_Brendel1", + "~Roland_Simon_Zimmermann1", + "~Matthias_Bethge1", + "~Judy_Borowski1", + "web.de", + "~Robert_Geirhos1", + "~Thomas_S._A._Wallis1" + ], + "rank": 430 + }, + { + "url": "https://openreview.net/forum?id=Vfs_2RnOD0H", + "decision": "Accept (Spotlight)", + "ratings": [ + 6, + 6, + 7, + 7 + ], + "rating": "6.56", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "Checkpointing enables the training of deep learning models under restricted memory budgets by freeing intermediate activations from memory and recomputing them on demand. Current checkpointing techniques statically plan these recomputations offline and assume static computation graphs. 
We demonstrate that a simple online algorithm can achieve comparable performance by introducing Dynamic Tensor Rematerialization (DTR), a greedy online algorithm for checkpointing that is extensible and general, is parameterized by eviction policy, and supports dynamic models. We prove that DTR can train an N-layer linear feedforward network on an \u03a9(N) memory budget with only O(N) tensor operations. DTR closely matches the performance of optimal static checkpointing in simulated experiments. We incorporate a DTR prototype into PyTorch merely by interposing on tensor allocations and operator calls and collecting lightweight metadata on tensors.", + "title": "Dynamic Tensor Rematerialization", + "authors": [ + "Marisa Kirisame", + "Steven Lyubomirsky", + "Altan Haan", + "Jennifer Brennan", + "Mike He", + "Jared Roesch", + "Tianqi Chen", + "Zachary Tatlock" + ], + "emails": [ + "~Tianqi_Chen1", + "~Zachary_Tatlock1", + "~Steven_Lyubomirsky1", + "cs.washington.edu" + ], + "rank": 431 + }, + { + "url": "https://openreview.net/forum?id=04ArenGOz3", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 7 + ], + "rating": "6.55", + "confidences": [ + 3, + 2, + 3, + 3 + ], + "abstract": "Set prediction is about learning to predict a collection of unordered variables with unknown interrelations. Training such models with set losses imposes the structure of a metric space over sets. We focus on stochastic and underdefined cases, where an incorrectly chosen loss function leads to implausible predictions. Example tasks include conditional point-cloud reconstruction and predicting future states of molecules. In this paper we propose an alternative to training via set losses, by viewing learning as conditional density estimation. Our learning framework fits deep energy-based models and approximates the intractable likelihood with gradient-guided sampling. 
Furthermore, we propose a stochastically augmented prediction algorithm that enables multiple predictions, reflecting the possible variations in the target set. We empirically demonstrate on a variety of datasets the capability to learn multi-modal densities and produce different plausible predictions. Our approach is competitive with previous set prediction models on standard benchmarks. More importantly, it extends the family of addressable tasks beyond those that have unambiguous predictions.", + "title": "Set Prediction without Imposing Structure as Conditional Density Estimation", + "authors": [ + "David W Zhang", + "Gertjan J. Burghouts", + "Cees G. M. Snoek" + ], + "emails": [ + "tno.nl", + "~Cees_G._M._Snoek1", + "~David_W_Zhang1" + ], + "rank": 432 + }, + { + "url": "https://openreview.net/forum?id=ZK6vTvb84s", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 7 + ], + "rating": "6.55", + "confidences": [ + 3, + 4, + 2, + 2 + ], + "abstract": "We address the problem of learning on sets of features, motivated by the need of performing pooling operations in long biological sequences of varying sizes, with long-range dependencies, and possibly few labeled data. To address this challenging task, we introduce a parametrized representation of fixed size, which embeds and then aggregates elements from a given input set according to the optimal transport plan between the set and a trainable reference. Our approach scales to large datasets and allows end-to-end training of the reference, while also providing a simple unsupervised learning mechanism with small computational cost. Our aggregation technique admits two useful interpretations: it may be seen as a mechanism related to attention layers in neural networks, or it may be seen as a scalable surrogate of a classical optimal transport-based kernel. 
We experimentally demonstrate the effectiveness of our approach on biological sequences, achieving state-of-the-art results for protein fold recognition and detection of chromatin profiles tasks, and, as a proof of concept, we show promising results for processing natural language sequences. We provide an open-source implementation of our embedding that can be used alone or as a module in larger learning models at https://github.com/claying/OTK.", + "title": "A Trainable Optimal Transport Embedding for Feature Aggregation and its Relationship to Attention", + "authors": [ + "Gr\u00e9goire Mialon", + "Dexiong Chen", + "Alexandre d'Aspremont", + "Julien Mairal" + ], + "emails": [ + "~Julien_Mairal1", + "~Gr\u00e9goire_Mialon1", + "~Dexiong_Chen1", + "~Alexandre_d'Aspremont1" + ], + "rank": 433 + }, + { + "url": "https://openreview.net/forum?id=tGZu6DlbreV", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 8, + 6, + 7 + ], + "rating": "6.55", + "confidences": [ + 4, + 1, + 2, + 4 + ], + "abstract": "This paper studies learning logic rules for reasoning on knowledge graphs. Logic rules provide interpretable explanations when used for prediction as well as being able to generalize to other tasks, and hence are critical to learn. Existing methods either suffer from the problem of searching in a large search space (e.g., neural logic programming) or ineffective optimization due to sparse rewards (e.g., techniques based on reinforcement learning). To address these limitations, this paper proposes a probabilistic model called RNNLogic. RNNLogic treats logic rules as a latent variable, and simultaneously trains a rule generator as well as a reasoning predictor with logic rules. We develop an EM-based algorithm for optimization. In each iteration, the reasoning predictor is updated to explore some generated logic rules for reasoning. 
Then in the E-step, we select a set of high-quality rules from all generated rules with both the rule generator and reasoning predictor via posterior inference; and in the M-step, the rule generator is updated with the rules selected in the E-step. Experiments on four datasets prove the effectiveness of RNNLogic.", + "title": "RNNLogic: Learning Logic Rules for Reasoning on Knowledge Graphs", + "authors": [ + "Meng Qu", + "Junkun Chen", + "Louis-Pascal Xhonneux", + "Yoshua Bengio", + "Jian Tang" + ], + "emails": [ + "mila.quebec", + "mails.tsinghua.edu.cn", + "~Meng_Qu2", + "~Yoshua_Bengio1", + "~Jian_Tang1" + ], + "rank": 434 + }, + { + "url": "https://openreview.net/forum?id=0IO5VdnSAaH", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 6 + ], + "rating": "6.55", + "confidences": [ + 3, + 3, + 2, + 3 + ], + "abstract": "We prove a non-asymptotic distribution-independent lower bound for the expected mean squared generalization error caused by label noise in ridgeless linear regression. Our lower bound generalizes a similar known result to the overparameterized (interpolating) regime. In contrast to most previous works, our analysis applies to a broad class of input distributions with almost surely full-rank feature matrices, which allows us to cover various types of deterministic or random feature maps. Our lower bound is asymptotically sharp and implies that in the presence of label noise, ridgeless linear regression does not perform well around the interpolation threshold for any of these feature maps. We analyze the imposed assumptions in detail and provide a theory for analytic (random) feature maps. Using this theory, we can show that our assumptions are satisfied for input distributions with a (Lebesgue) density and feature maps given by random deep neural networks with analytic activation functions like sigmoid, tanh, softplus or GELU. 
As further examples, we show that feature maps from random Fourier features and polynomial kernels also satisfy our assumptions. We complement our theory with further experimental and analytic results.", + "title": "On the Universality of the Double Descent Peak in Ridgeless Regression", + "authors": [ + "David Holzm\u00fcller" + ], + "emails": [ + "~David_Holzm\u00fcller1" + ], + "rank": 435 + }, + { + "url": "https://openreview.net/forum?id=jC9G3ns6jH", + "decision": "Reject", + "ratings": [ + 6, + 6, + 7, + 8 + ], + "rating": "6.55", + "confidences": [ + 4, + 3, + 2, + 2 + ], + "abstract": "In the past few years, various approaches have been developed to explain and interpret deep neural network (DNN) representations, but it has been pointed out that these representations are sometimes unstable and not reproducible. In this paper, we interpret these representations as hypotheses driven by DNN (called DNN-driven hypotheses) and propose a method to quantify the reliability of these hypotheses in statistical hypothesis testing framework. To this end, we introduce Selective Inference (SI) framework, which has received much attention in the past few years as a new statistical inference framework for data-driven hypotheses. The basic idea of SI is to make conditional inferences on the selected hypotheses under the condition that they are selected. In order to use SI framework for DNN representations, we develop a new SI algorithm based on homotopy method which enables us to derive the exact (non-asymptotic) conditional sampling distribution of the DNN-driven hypotheses. 
We conduct experiments on both synthetic and real-world datasets, through which we offer evidence that our proposed method can successfully control the false positive rate, has decent performance in terms of computational efficiency, and provides good results in practical applications.", + "title": "Quantifying Statistical Significance of Neural Network Representation-Driven Hypotheses by Selective Inference", + "authors": [ + "Vo Nguyen Le Duy", + "Shogo Iwazaki", + "Ichiro Takeuchi" + ], + "emails": [ + "gmail.com", + "~Ichiro_Takeuchi1", + "~Vo_Nguyen_Le_Duy1" + ], + "rank": 436 + }, + { + "url": "https://openreview.net/forum?id=enoVQWLsfyL", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 6 + ], + "rating": "6.54", + "confidences": [ + 4, + 3, + 3, + 3 + ], + "abstract": "Many recent methods for unsupervised representation learning train models to be invariant to different \"views,\" or distorted versions of an input. However, designing these views requires considerable trial and error by human experts, hindering widespread adoption of unsupervised representation learning methods across domains and modalities. To address this, we propose viewmaker networks: generative models that learn to produce useful views from a given input. Viewmakers are stochastic bounded adversaries: they produce views by generating and then adding an \u2113p-bounded perturbation to the input, and are trained adversarially with respect to the main encoder network. Remarkably, when pretraining on CIFAR-10, our learned views enable comparable transfer accuracy to the well-tuned SimCLR augmentations---despite not including transformations like cropping or color jitter. Furthermore, our learned views significantly outperform baseline augmentations on speech recordings (+9 points on average) and wearable sensor data (+17 points on average). 
Viewmaker views can also be combined with handcrafted views: they improve robustness to common image corruptions and can increase transfer performance in cases where handcrafted views are less explored. These results suggest that viewmakers may provide a path towards more general representation learning algorithms---reducing the domain expertise and effort needed to pretrain on a much wider set of domains. Code is available at https://github.com/alextamkin/viewmaker.", + "title": "Viewmaker Networks: Learning Views for Unsupervised Representation Learning", + "authors": [ + "Alex Tamkin", + "Mike Wu", + "Noah Goodman" + ], + "emails": [ + "~Noah_Goodman1", + "~Alex_Tamkin1", + "~Mike_Wu1" + ], + "rank": 437 + }, + { + "url": "https://openreview.net/forum?id=bhCDO_cEGCz", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 6 + ], + "rating": "6.54", + "confidences": [ + 2, + 4, + 3, + 4 + ], + "abstract": "We study the problem of dynamic visual reasoning on raw videos. This is a challenging problem; currently, state-of-the-art models often require dense supervision on physical object properties and events from simulation, which are impractical to obtain in real life. In this paper, we present the Dynamic Concept Learner (DCL), a unified framework that grounds physical objects and events from video and language. DCL first adopts a trajectory extractor to track each object over time and to represent it as a latent, object-centric feature vector. Building upon this object-centric representation, DCL learns to approximate the dynamic interaction among objects using graph networks. DCL further incorporates a semantic parser to parse question into semantic programs and, finally, a program executor to run the program to answer the question, levering the learned dynamics model. 
After training, DCL can detect and associate objects across the frames, ground visual properties and physical events, understand the causal relationship between events, make future and counterfactual predictions, and leverage these extracted presentations for answering queries. DCL achieves state-of-the-art performance on CLEVRER, a challenging causal video reasoning dataset, even without using ground-truth attributes and collision labels from simulations for training. We further test DCL on a newly proposed video-retrieval and event localization dataset derived from CLEVRER, showing its strong generalization capacity.", + "title": "Grounding Physical Concepts of Objects and Events Through Dynamic Visual Reasoning", + "authors": [ + "Zhenfang Chen", + "Jiayuan Mao", + "Jiajun Wu", + "Kwan-Yee Kenneth Wong", + "Joshua B. Tenenbaum", + "Chuang Gan" + ], + "emails": [ + "~Jiajun_Wu1", + "~Jiayuan_Mao1", + "~Zhenfang_Chen1", + "~Joshua_B._Tenenbaum1", + "~Chuang_Gan1", + "~Kwan-Yee_Kenneth_Wong2" + ], + "rank": 438 + }, + { + "url": "https://openreview.net/forum?id=PbEHqvFtcS", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 7, + 6, + 5 + ], + "rating": "6.54", + "confidences": [ + 4, + 3, + 2, + 4 + ], + "abstract": "We study adversary-resilient stochastic distributed optimization, in which m machines can independently compute stochastic gradients, and cooperate to jointly optimize over their local objective functions. However, an \u03b1-fraction of the machines are Byzantine, in that they may behave in arbitrary, adversarial ways. We consider a variant of this procedure in the challenging non-convex case. Our main result is a new algorithm SafeguardSGD, which can provably escape saddle points and find approximate local minima of the non-convex objective. 
The algorithm is based on a new concentration filtering technique, and its sample and time complexity bounds match the best known theoretical bounds in the stochastic, distributed setting when no Byzantine machines are present. \n\nOur algorithm is very practical: it improves upon the performance of all prior methods when training deep neural networks, it is relatively lightweight, and it is the first method to withstand two recently-proposed Byzantine attacks. ", + "title": "Byzantine-Resilient Non-Convex Stochastic Gradient Descent", + "authors": [ + "Zeyuan Allen-Zhu", + "Faeze Ebrahimianghazani", + "Jerry Li", + "Dan Alistarh" + ], + "emails": [ + "~Dan_Alistarh7", + "gmail.com", + "~Jerry_Li1", + "~Zeyuan_Allen-Zhu1" + ], + "rank": 439 + }, + { + "url": "https://openreview.net/forum?id=n6jl7fLxrP", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 7, + 9, + 6 + ], + "rating": "6.54", + "confidences": [ + 4, + 3, + 4, + 2 + ], + "abstract": "In many important graph data processing applications the acquired information includes both node features and observations of the graph topology. Graph neural networks (GNNs) are designed to exploit both sources of evidence but they do not optimally trade-off their utility and integrate them in a manner that is also universal. Here, universality refers to independence on homophily or heterophily graph assumptions. We address these issues by introducing a new Generalized PageRank (GPR) GNN architecture that adaptively learns the GPR weights so as to jointly optimize node feature and topological information extraction, regardless of the extent to which the node labels are homophilic or heterophilic. Learned GPR weights automatically adjust to the node label pattern, irrelevant on the type of initialization, and thereby guarantee excellent learning performance for label patterns that are usually hard to handle. 
Furthermore, they allow one to avoid feature over-smoothing, a process which renders feature information nondiscriminative, without requiring the network to be shallow. Our accompanying theoretical analysis of the GPR-GNN method is facilitated by novel synthetic benchmark datasets generated by the so-called contextual stochastic block model. We also compare the performance of our GNN architecture with that of several state-of-the-art GNNs on the problem of node-classification, using well-known benchmark homophilic and heterophilic datasets. The results demonstrate that GPR-GNN offers significant performance improvement compared to existing techniques on both synthetic and benchmark data.", + "title": "Adaptive Universal Generalized PageRank Graph Neural Network", + "authors": [ + "Eli Chien", + "Jianhao Peng", + "Pan Li", + "Olgica Milenkovic" + ], + "emails": [ + "~Olgica_Milenkovic1", + "~Eli_Chien1", + "~Jianhao_Peng1", + "~Pan_Li2" + ], + "rank": 440 + }, + { + "url": "https://openreview.net/forum?id=O9bnihsFfXU", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 8, + 6 + ], + "rating": "6.54", + "confidences": [ + 4, + 3, + 4, + 2 + ], + "abstract": "We identify an implicit under-parameterization phenomenon in value-based deep RL methods that use bootstrapping: when value functions, approximated using deep neural networks, are trained with gradient descent using iterated regression onto target values generated by previous instances of the value network, more gradient updates decrease the expressivity of the current value network. We char- acterize this loss of expressivity via a drop in the rank of the learned value net- work features, and show that this typically corresponds to a performance drop. We demonstrate this phenomenon on Atari and Gym benchmarks, in both offline and online RL settings. We formally analyze this phenomenon and show that it results from a pathological interaction between bootstrapping and gradient-based optimization. 
We further show that mitigating implicit under-parameterization by controlling rank collapse can improve performance.", + "title": "Implicit Under-Parameterization Inhibits Data-Efficient Deep Reinforcement Learning", + "authors": [ + "Aviral Kumar", + "Rishabh Agarwal", + "Dibya Ghosh", + "Sergey Levine" + ], + "emails": [ + "~Dibya_Ghosh1", + "~Sergey_Levine1", + "~Aviral_Kumar2", + "~Rishabh_Agarwal2" + ], + "rank": 441 + }, + { + "url": "https://openreview.net/forum?id=q8qLAbQBupm", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 5, + 6, + 7 + ], + "rating": "6.54", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Understanding the dynamics of neural network parameters during training is one of the key challenges in building a theoretical foundation for deep learning. A central obstacle is that the motion of a network in high-dimensional parameter space undergoes discrete finite steps along complex stochastic gradients derived from real-world datasets. We circumvent this obstacle through a unifying theoretical framework based on intrinsic symmetries embedded in a network's architecture that are present for any dataset. We show that any such symmetry imposes stringent geometric constraints on gradients and Hessians, leading to an associated conservation law in the continuous-time limit of stochastic gradient descent (SGD), akin to Noether's theorem in physics. We further show that finite learning rates used in practice can actually break these symmetry induced conservation laws. We apply tools from finite difference methods to derive modified gradient flow, a differential equation that better approximates the numerical trajectory taken by SGD at finite learning rates. We combine modified gradient flow with our framework of symmetries to derive exact integral expressions for the dynamics of certain parameter combinations. We empirically validate our analytic expressions for learning dynamics on VGG-16 trained on Tiny ImageNet. 
Overall, by exploiting symmetry, our work demonstrates that we can analytically describe the learning dynamics of various parameter combinations at finite learning rates and batch sizes for state of the art architectures trained on any dataset.", + "title": "Neural Mechanics: Symmetry and Broken Conservation Laws in Deep Learning Dynamics", + "authors": [ + "Daniel Kunin", + "Javier Sagastuy-Brena", + "Surya Ganguli", + "Daniel LK Yamins", + "Hidenori Tanaka" + ], + "emails": [ + "~Javier_Sagastuy-Brena1", + "~Daniel_LK_Yamins1", + "~Surya_Ganguli1", + "~Hidenori_Tanaka1", + "~Daniel_Kunin1" + ], + "rank": 442 + }, + { + "url": "https://openreview.net/forum?id=1Jv6b0Zq3qi", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 6 + ], + "rating": "6.53", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "For many practical, high-risk applications, it is essential to quantify uncertainty in a model's predictions to avoid costly mistakes. While predictive uncertainty is widely studied for neural networks, the topic seems to be under-explored for models based on gradient boosting. However, gradient boosting often achieves state-of-the-art results on tabular data. This work examines a probabilistic ensemble-based framework for deriving uncertainty estimates in the predictions of gradient boosting classification and regression models. We conducted experiments on a range of synthetic and real datasets and investigated the applicability of ensemble approaches to gradient boosting models that are themselves ensembles of decision trees. Our analysis shows that ensembles of gradient boosting models successfully detect anomalous inputs while having limited ability to improve the predicted total uncertainty. Importantly, we also propose a concept of a virtual ensemble to get the benefits of an ensemble via only one gradient boosting model, which significantly reduces complexity. 
", + "title": "Uncertainty in Gradient Boosting via Ensembles", + "authors": [ + "Andrey Malinin", + "Liudmila Prokhorenkova", + "Aleksei Ustimenko" + ], + "emails": [ + "yandex-team.ru", + "~Andrey_Malinin1", + "~Liudmila_Prokhorenkova1" + ], + "rank": 443 + }, + { + "url": "https://openreview.net/forum?id=CGQ6ENUMX6", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 6 + ], + "rating": "6.53", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Deep reinforcement learning primarily focuses on learning behavior, usually overlooking the fact that an agent's function is largely determined by form. So, how should one go about finding a morphology fit for solving tasks in a given environment? Current approaches that co-adapt morphology and behavior use a specific task's reward as a signal for morphology optimization. However, this often requires expensive policy optimization and results in task-dependent morphologies that are not built to generalize. In this work, we propose a new approach, Task-Agnostic Morphology Evolution (TAME), to alleviate both of these issues. Without any task or reward specification, TAME evolves morphologies by only applying randomly sampled action primitives on a population of agents. This is accomplished using an information-theoretic objective that efficiently ranks agents by their ability to reach diverse states in the environment and the causality of their actions. Finally, we empirically demonstrate that across 2D, 3D, and manipulation environments TAME can evolve morphologies that match the multi-task performance of those learned with task supervised algorithms. 
Our code and videos can be found at https://sites.google.com/view/task-agnostic-evolution .\n", + "title": "Task-Agnostic Morphology Evolution", + "authors": [ + "Donald Joseph Hejna III", + "Pieter Abbeel", + "Lerrel Pinto" + ], + "emails": [ + "~Donald_Joseph_Hejna_III1", + "~Pieter_Abbeel2", + "~Lerrel_Pinto1" + ], + "rank": 444 + }, + { + "url": "https://openreview.net/forum?id=IDFQI9OY6K", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 6, + 6, + 6 + ], + "rating": "6.53", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Obtaining large annotated datasets is critical for training successful machine learning models and it is often a bottleneck in practice. Weak supervision offers a promising alternative for producing labeled datasets without ground truth annotations by generating probabilistic labels using multiple noisy heuristics. This process can scale to large datasets and has demonstrated state of the art performance in diverse domains such as healthcare and e-commerce. One practical issue with learning from user-generated heuristics is that their creation requires creativity, foresight, and domain expertise from those who hand-craft them, a process which can be tedious and subjective. We develop the first framework for interactive weak supervision in which a method proposes heuristics and learns from user feedback given on each proposed heuristic. Our experiments demonstrate that only a small number of feedback iterations are needed to train models that achieve highly competitive test set performance without access to ground truth training labels. 
We conduct user studies, which show that users are able to effectively provide feedback on heuristics and that test set results track the performance of simulated oracles.", + "title": "Interactive Weak Supervision: Learning Useful Heuristics for Data Labeling", + "authors": [ + "Benedikt Boecking", + "Willie Neiswanger", + "Eric Xing", + "Artur Dubrawski" + ], + "emails": [ + "~Eric_Xing1", + "~Benedikt_Boecking1", + "~Artur_Dubrawski2", + "~Willie_Neiswanger2" + ], + "rank": 445 + }, + { + "url": "https://openreview.net/forum?id=edJ_HipawCa", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7, + 6, + 7 + ], + "rating": "6.53", + "confidences": [ + 4, + 5, + 3, + 4, + 3 + ], + "abstract": "We study how representation learning can improve the efficiency of bandit problems. We study the setting where we play T linear bandits with dimension d concurrently, and these T bandit tasks share a common k(\u226ad) dimensional linear representation. For the finite-action setting, we present a new algorithm which achieves O~(TkN+dkNT) regret, where N is the number of rounds we play for each bandit. When T is sufficiently large, our algorithm significantly outperforms the naive algorithm (playing T bandits independently) that achieves O~(TdN) regret. We also provide an \u03a9(TkN+dkNT) regret lower bound, showing that our algorithm is minimax-optimal up to poly-logarithmic factors. Furthermore, we extend our algorithm to the infinite-action setting and obtain a corresponding regret bound which demonstrates the benefit of representation learning in certain regimes. We also present experiments on synthetic and real-world data to illustrate our theoretical findings and demonstrate the effectiveness of our proposed algorithms.", + "title": "Impact of Representation Learning in Linear Bandits", + "authors": [ + "Jiaqi Yang", + "Wei Hu", + "Jason D. 
Lee", + "Simon Shaolei Du" + ], + "emails": [ + "~Simon_Shaolei_Du1", + "~Jason_D._Lee1", + "~Jiaqi_Yang2", + "~Wei_Hu1" + ], + "rank": 446 + }, + { + "url": "https://openreview.net/forum?id=YbDGyviJkrL", + "decision": "Reject", + "ratings": [ + 7, + 6, + 7, + 6 + ], + "rating": "6.53", + "confidences": [ + 5, + 3, + 3, + 4 + ], + "abstract": "Transformers are widely used in neural language processing due to their ability to model longer-term dependencies in text. Although these models achieve state-of-the-art performance for many language related tasks, their applicability outside of the neural language processing field has been minimal. In this work, we propose the use of transformer models for the prediction of dynamical systems representative of physical phenomena. The use of Koopman based embeddings provide a unique and powerful method for projecting any dynamical system into a vector representation which can then be predicted by a transformer model. The proposed model is able to accurately predict various dynamical systems and outperform classical methods that are commonly used in the scientific machine learning literature.", + "title": "Transformers for Modeling Physical Systems", + "authors": [ + "Nicholas Geneva", + "Nicholas Zabaras" + ], + "emails": [ + "gmail.com", + "~Nicholas_Geneva1" + ], + "rank": 447 + }, + { + "url": "https://openreview.net/forum?id=5l9zj5G7vDY", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 6 + ], + "rating": "6.53", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Capturing the structure of a data-generating process by means of appropriate inductive biases can help in learning models that generalise well and are robust to changes in the input distribution. While methods that harness spatial and temporal structures find broad application, recent work has demonstrated the potential of models that leverage sparse and modular structure using an ensemble of sparingly interacting modules. 
In this work, we take a step towards dynamic models that are capable of simultaneously exploiting both modular and spatiotemporal structures. To this end, we model the dynamical system as a collection of autonomous but sparsely interacting sub-systems that interact according to a learned topology which is informed by the spatial structure of the underlying system. This gives rise to a class of models that are well suited for capturing the dynamics of systems that only offer local views into their state, along with corresponding spatial locations of those views. On the tasks of video prediction from cropped frames and multi-agent world modelling from partial observations in the challenging Starcraft2 domain, we find our models to be more robust to the number of available views and better capable of generalisation to novel tasks without additional training than strong baselines that perform equally well or better on the training distribution. ", + "title": "Spatially Structured Recurrent Modules", + "authors": [ + "Nasim Rahaman", + "Anirudh Goyal", + "Muhammad Waleed Gondal", + "Manuel Wuthrich", + "Stefan Bauer", + "Yash Sharma", + "Yoshua Bengio", + "Bernhard Sch\u00f6lkopf" + ], + "emails": [ + "~Manuel_Wuthrich1", + "~Muhammad_Waleed_Gondal1", + "~Bernhard_Sch\u00f6lkopf1", + "~Stefan_Bauer1", + "~Yoshua_Bengio1", + "~Nasim_Rahaman1", + "~Anirudh_Goyal1", + "~Yash_Sharma1" + ], + "rank": 448 + }, + { + "url": "https://openreview.net/forum?id=3jjmdp7Hha", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 6 + ], + "rating": "6.53", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "Back-translation is an effective strategy to improve the performance of Neural Machine Translation~(NMT) by generating pseudo-parallel data. 
However, several recent works have found that better translation quality in the pseudo-parallel data does not necessarily lead to a better final translation model, while lower-quality but diverse data often yields stronger results instead.\nIn this paper we propose a new way to generate pseudo-parallel data for back-translation that directly optimizes the final model performance. Specifically, we propose a meta-learning framework where the back-translation model learns to match the forward-translation model's gradients on the development data with those on the pseudo-parallel data. In our evaluations in both the standard datasets WMT En-De'14 and WMT En-Fr'14, as well as a multilingual translation setting, our method leads to significant improvements over strong baselines. ", + "title": "Meta Back-Translation", + "authors": [ + "Hieu Pham", + "Xinyi Wang", + "Yiming Yang", + "Graham Neubig" + ], + "emails": [ + "~Hieu_Pham1", + "~Xinyi_Wang1", + "~Yiming_Yang1", + "~Graham_Neubig1" + ], + "rank": 449 + }, + { + "url": "https://openreview.net/forum?id=D3PcGLdMx0", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 7 + ], + "rating": "6.53", + "confidences": [ + 5, + 5, + 3, + 4 + ], + "abstract": "Most recent few-shot learning (FSL) approaches are based on episodic training whereby each episode samples few training instances (shots) per class to imitate the test condition. However, this strict adhering to test condition has a negative side effect, that is, the trained model is susceptible to the poor sampling of few shots. In this work, for the first time, this problem is addressed by exploiting inter-episode relationships. Specifically, a novel meta-learning via modeling episode-level relationships (MELR) framework is proposed. By sampling two episodes containing the same set of classes for meta-training, MELR is designed to ensure that the meta-learned model is robust against the presence of poorly-sampled shots in the meta-test stage. 
This is achieved through two key components: (1) a Cross-Episode Attention Module (CEAM) to improve the ability of alleviating the effects of poorly-sampled shots, and (2) a Cross-Episode Consistency Regularization (CECR) to enforce that the two classifiers learned from the two episodes are consistent even when there are unrepresentative instances. Extensive experiments for non-transductive standard FSL on two benchmarks show that our MELR achieves 1.0%-5.0% improvements over the baseline (i.e., ProtoNet) used for FSL in our model and outperforms the latest competitors under the same settings.", + "title": "MELR: Meta-Learning via Modeling Episode-Level Relationships for Few-Shot Learning", + "authors": [ + "Nanyi Fei", + "Zhiwu Lu", + "Tao Xiang", + "Songfang Huang" + ], + "emails": [ + "~Songfang_Huang1", + "~Nanyi_Fei1", + "~Zhiwu_Lu1", + "~Tao_Xiang1" + ], + "rank": 450 + }, + { + "url": "https://openreview.net/forum?id=ZzwDy_wiWv", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 6 + ], + "rating": "6.53", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "This paper addresses the problem of model compression via knowledge distillation. We advocate for a method that optimizes the output feature of the penultimate layer of the student network and hence is directly related to representation learning. Previous distillation methods which typically impose direct feature matching between the student and the teacher do not take into account the classification problem at hand. On the contrary, our distillation method decouples representation learning and classification and utilizes the teacher's pre-trained classifier to train the student's penultimate layer feature. In particular, for the same input image, we wish the teacher's and student's feature to produce the same output when passed through the teacher's classifier which is achieved with a simple L2 loss. 
Our method is extremely simple to implement and straightforward to train and is shown to consistently outperform previous state-of-the-art methods over a large set of experimental settings including different (a) network architectures, (b) teacher-student capacities, (c) datasets, and (d) domains. The code will be available at \\url{https://github.com/jingyang2017/KD_SRRL}.", + "title": "Knowledge distillation via softmax regression representation learning", + "authors": [ + "Jing Yang", + "Brais Martinez", + "Adrian Bulat", + "Georgios Tzimiropoulos" + ], + "emails": [ + "~Georgios_Tzimiropoulos1", + "~Jing_Yang7", + "~Brais_Martinez3", + "~Adrian_Bulat1" + ], + "rank": 451 + }, + { + "url": "https://openreview.net/forum?id=HOFxeCutxZR", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 6, + 6, + 6 + ], + "rating": "6.53", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Controllable semantic image editing enables a user to change entire image attributes with a few clicks, e.g., gradually making a summer scene look like it was taken in winter. Classic approaches for this task use a Generative Adversarial Net (GAN) to learn a latent space and suitable latent-space transformations. However, current approaches often suffer from attribute edits that are entangled, global image identity changes, and diminished photo-realism. To address these concerns, we learn multiple attribute transformations simultaneously, integrate attribute regression into the training of transformation functions, and apply a content loss and an adversarial loss that encourages the maintenance of image identity and photo-realism. We propose quantitative evaluation strategies for measuring controllable editing performance, unlike prior work, which primarily focuses on qualitative evaluation. Our model permits better control for both single- and multiple-attribute editing while preserving image identity and realism during transformation. 
We provide empirical results for both natural and synthetic images, highlighting that our model achieves state-of-the-art performance for targeted image manipulation. ", + "title": "Enjoy Your Editing: Controllable GANs for Image Editing via Latent Space Navigation", + "authors": [ + "Peiye Zhuang", + "Oluwasanmi O Koyejo", + "Alex Schwing" + ], + "emails": [ + "~Oluwasanmi_O_Koyejo1", + "~Peiye_Zhuang2", + "~Alex_Schwing1" + ], + "rank": 452 + }, + { + "url": "https://openreview.net/forum?id=8E1-f3VhX1o", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 7 + ], + "rating": "6.53", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Graph Neural Networks (GNNs) are a predominant technique for learning over graphs. However, there is relatively little understanding of why GNNs are successful in practice and whether they are necessary for good performance. Here, we show that for many standard transductive node classification benchmarks, we can exceed or match the performance of state-of-the-art GNNs by combining shallow models that ignore the graph structure with two simple post-processing steps that exploit correlation in the label structure: (i) an \u201cerror correlation\u201d that spreads residual errors in training data to correct errors in test data and (ii) a \u201cprediction correlation\u201d that smooths the predictions on the test data. We call this overall procedure Correct and Smooth (C&S), and the post-processing steps are implemented via simple modifications to standard label propagation techniques that have long been used in graph-based semi-supervised learning. Our approach exceeds or nearly matches the performance of state-of-the-art GNNs on a wide variety of benchmarks, with just a small fraction of the parameters and orders of magnitude faster runtime. For instance, we exceed the best-known GNN performance on the OGB-Products dataset with 137 times fewer parameters and greater than 100 times less training time. 
The performance of our methods highlights how directly incorporating label information into the learning algorithm (as is common in traditional methods) yields easy and substantial performance gains. We can also incorporate our techniques into big GNN models, providing modest gains in some cases.", + "title": "Combining Label Propagation and Simple Models out-performs Graph Neural Networks", + "authors": [ + "Qian Huang", + "Horace He", + "Abhay Singh", + "Ser-Nam Lim", + "Austin Benson" + ], + "emails": [ + "cornell.edu", + "~Austin_Benson1", + "~Ser-Nam_Lim3", + "~Horace_He1" + ], + "rank": 453 + }, + { + "url": "https://openreview.net/forum?id=t86MwoUCCNe", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 7 + ], + "rating": "6.53", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": " We consider the problem of distributed mean estimation (DME), in which n machines are each given a local d-dimensional vector xv\u2208Rd, and must cooperate to estimate the mean of their inputs \u03bc=1n\u2211v=1nxv, while minimizing total communication cost. DME is a fundamental construct in distributed machine learning, and there has been considerable work on variants of this problem, especially in the context of distributed variance reduction for stochastic gradients in parallel SGD. Previous work typically assumes an upper bound on the norm of the input vectors, and achieves an error bound in terms of this norm. However, in many real applications, the input vectors are concentrated around the correct output \u03bc, but \u03bc itself has large norm. In such cases, previous output error bounds perform poorly. \n In this paper, we show that output error bounds need not depend on input norm. We provide a method of quantization which allows distributed mean estimation to be performed with solution quality dependent only on the distance between inputs, not on input norm, and show an analogous result for distributed variance reduction. 
The technique is based on a new connection with lattice theory. We also provide lower bounds showing that the communication to error trade-off of our algorithms is asymptotically optimal. As the lattices achieving optimal bounds under \u21132-norm can be computationally impractical, we also present an extension which leverages easy-to-use cubic lattices, and is loose only up to a logarithmic factor in d. We show experimentally that our method yields practical improvements for common applications, relative to prior approaches. ", + "title": "New Bounds For Distributed Mean Estimation and Variance Reduction", + "authors": [ + "Peter Davies", + "Vijaykrishna Gurunanthan", + "Niusha Moshrefi", + "Saleh Ashkboos", + "Dan Alistarh" + ], + "emails": [ + "~Dan_Alistarh7", + "gmail.com", + "ist.ac.at" + ], + "rank": 454 + }, + { + "url": "https://openreview.net/forum?id=60j5LygnmD", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 8 + ], + "rating": "6.53", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Deep learning models require a large amount of data to perform well. When data is scarce for a target task, we can transfer the knowledge gained by training on similar tasks to quickly learn the target. A successful approach is meta-learning, or \"learning to learn\" a distribution of tasks, where \"learning\" is represented by an outer loop, and \"to learn\" by an inner loop of gradient descent. However, a number of recent empirical studies argue that the inner loop is unnecessary and more simple models work equally well or even better. We study the performance of MAML as a function of the learning rate of the inner loop, where zero learning rate implies that there is no inner loop. Using random matrix theory and exact solutions of linear models, we calculate an algebraic expression for the test loss of MAML applied to mixed linear regression and nonlinear regression with overparameterized models. 
Surprisingly, while the optimal learning rate for adaptation is positive, we find that the optimal learning rate for training is always negative, a setting that has never been considered before. Therefore, not only does the performance increase by decreasing the learning rate to zero, as suggested by recent work, but it can be increased even further by decreasing the learning rate to negative\nvalues. These results help clarify under what circumstances meta-learning performs best.", + "title": "Meta-learning with negative learning rates", + "authors": [ + "Alberto Bernacchia" + ], + "emails": [ + "~Alberto_Bernacchia1" + ], + "rank": 455 + }, + { + "url": "https://openreview.net/forum?id=g11CZSghXyY", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 7, + 8, + 7 + ], + "rating": "6.53", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "Ensemble methods which average over multiple neural network predictions are a simple approach to improve a model\u2019s calibration and robustness. Similarly, data augmentation techniques, which encode prior information in the form of invariant feature transformations, are effective for improving calibration and robustness. In this paper, we show a surprising pathology: combining ensembles and data augmentation can harm model calibration. This leads to a trade-off in practice, whereby improved accuracy by combining the two techniques comes at the expense of calibration. On the other hand, selecting only one of the techniques ensures good uncertainty estimates at the expense of accuracy. We investigate this pathology and identify a compounding under-confidence among methods which marginalize over sets of weights and data augmentation techniques which soften labels. Finally, we propose a simple correction, achieving the best of both worlds with significant accuracy and calibration gains over using only ensembles or data augmentation individually. 
Applying the correction produces new state-of-the art in uncertainty calibration and robustness across CIFAR-10, CIFAR-100, and ImageNet.", + "title": "Combining Ensembles and Data Augmentation Can Harm Your Calibration", + "authors": [ + "Yeming Wen", + "Ghassen Jerfel", + "Rafael Muller", + "Michael W Dusenberry", + "Jasper Snoek", + "Balaji Lakshminarayanan", + "Dustin Tran" + ], + "emails": [ + "~Rafael_Muller1", + "~Dustin_Tran1", + "~Michael_W_Dusenberry1", + "~Balaji_Lakshminarayanan1", + "~Yeming_Wen1", + "~Jasper_Snoek1", + "~Ghassen_Jerfel1" + ], + "rank": 456 + }, + { + "url": "https://openreview.net/forum?id=pmj131uIL9H", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 6 + ], + "rating": "6.53", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "3D pose estimation is a challenging but important task in computer vision. In this work, we show that standard deep learning approaches to 3D pose estimation are not robust to partial occlusion. Inspired by the robustness of generative vision models to partial occlusion, we propose to integrate deep neural networks with 3D generative representations of objects into a unified neural architecture that we term NeMo. In particular, NeMo learns a generative model of neural feature activations at each vertex on a dense 3D mesh. Using differentiable rendering we estimate the 3D object pose by minimizing the reconstruction error between NeMo and the feature representation of the target image. To avoid local optima in the reconstruction loss, we train the feature extractor to maximize the distance between the individual feature representations on the mesh using contrastive learning. Our extensive experiments on PASCAL3D+, occluded-PASCAL3D+ and ObjectNet3D show that NeMo is much more robust to partial occlusion compared to standard deep networks, while retaining competitive performance on non-occluded data. 
Interestingly, our experiments also show that NeMo performs reasonably well even when the mesh representation only crudely approximates the true object geometry with a cuboid, hence revealing that the detailed 3D geometry is not needed for accurate 3D pose estimation.", + "title": "NeMo: Neural Mesh Models of Contrastive Features for Robust 3D Pose Estimation", + "authors": [ + "Angtian Wang", + "Adam Kortylewski", + "Alan Yuille" + ], + "emails": [ + "~Alan_Yuille1", + "~Adam_Kortylewski1", + "~Angtian_Wang2" + ], + "rank": 457 + }, + { + "url": "https://openreview.net/forum?id=PObuuGVrGaZ", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 8 + ], + "rating": "6.53", + "confidences": [ + 3, + 5, + 3, + 4 + ], + "abstract": "This work aims to empirically clarify a recently discovered perspective that label smoothing is incompatible with knowledge distillation. We begin by introducing the motivation behind on how this incompatibility is raised, i.e., label smoothing erases relative information between teacher logits. We provide a novel connection on how label smoothing affects distributions of semantically similar and dissimilar classes. Then we propose a metric to quantitatively measure the degree of erased information in sample's representation. After that, we study its one-sidedness and imperfection of the incompatibility view through massive analyses, visualizations and comprehensive experiments on Image Classification, Binary Networks, and Neural Machine Translation. 
Finally, we broadly discuss several circumstances wherein label smoothing will indeed lose its effectiveness.", + "title": "Is Label Smoothing Truly Incompatible with Knowledge Distillation: An Empirical Study", + "authors": [ + "Zhiqiang Shen", + "Zechun Liu", + "Dejia Xu", + "Zitian Chen", + "Kwang-Ting Cheng", + "Marios Savvides" + ], + "emails": [ + "~Marios_Savvides1", + "~Dejia_Xu1", + "~Kwang-Ting_Cheng1", + "~Zhiqiang_Shen1", + "~Zitian_Chen1", + "~Zechun_Liu1" + ], + "rank": 458 + }, + { + "url": "https://openreview.net/forum?id=--gvHfE3Xf5", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 7 + ], + "rating": "6.53", + "confidences": [ + 4, + 3, + 3, + 5 + ], + "abstract": "In recent years, meta-learning, in which a model is trained on a family of tasks (i.e. a task distribution), has emerged as an approach to training neural networks to perform tasks that were previously assumed to require structured representations, making strides toward closing the gap between humans and machines. However, we argue that evaluating meta-learning remains a challenge, and can miss whether meta-learning actually uses the structure embedded within the tasks. These meta-learners might therefore still be significantly different from humans learners. To demonstrate this difference, we first define a new meta-reinforcement learning task in which a structured task distribution is generated using a compositional grammar. We then introduce a novel approach to constructing a \"null task distribution\" with the same statistical complexity as this structured task distribution but without the explicit rule-based structure used to generate the structured task. We train a standard meta-learning agent, a recurrent network trained with model-free reinforcement learning, and compare it with human performance across the two task distributions. 
We find a double dissociation in which humans do better in the structured task distribution whereas agents do better in the null task distribution -- despite comparable statistical complexity. This work highlights that multiple strategies can achieve reasonable meta-test performance, and that careful construction of control task distributions is a valuable way to understand which strategies meta-learners acquire, and how they might differ from humans. ", + "title": "Meta-Learning of Structured Task Distributions in Humans and Machines", + "authors": [ + "Sreejan Kumar", + "Ishita Dasgupta", + "Jonathan Cohen", + "Nathaniel Daw", + "Thomas Griffiths" + ], + "emails": [ + "~Sreejan_Kumar1", + "~Jonathan_Cohen1", + "~Thomas_Griffiths1", + "~Nathaniel_Daw1", + "~Ishita_Dasgupta1" + ], + "rank": 459 + }, + { + "url": "https://openreview.net/forum?id=TNkPBBYFkXg", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 7 + ], + "rating": "6.53", + "confidences": [ + 3, + 5, + 5, + 4 + ], + "abstract": "Federated Learning (FL) is a method of training machine learning models on private data distributed over a large number of possibly heterogeneous clients such as mobile phones and IoT devices. In this work, we propose a new federated learning framework named HeteroFL to address heterogeneous clients equipped with very different computation and communication capabilities. Our solution can enable the training of heterogeneous local models with varying computation complexities and still produce a single global inference model. For the first time, our method challenges the underlying assumption of existing work that local models have to share the same architecture as the global model. We demonstrate several strategies to enhance FL training and conduct extensive empirical evaluations, including five computation complexity levels of three model architecture on three datasets. 
We show that adaptively distributing subnetworks according to clients' capabilities is both computation and communication efficient.", + "title": "HeteroFL: Computation and Communication Efficient Federated Learning for Heterogeneous Clients", + "authors": [ + "Enmao Diao", + "Jie Ding", + "Vahid Tarokh" + ], + "emails": [ + "~Jie_Ding2", + "~Vahid_Tarokh1", + "~Enmao_Diao1" + ], + "rank": 460 + }, + { + "url": "https://openreview.net/forum?id=oFp8Mx_V5FL", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 6 + ], + "rating": "6.53", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Neural data compression has been shown to outperform classical methods in terms of RD performance, with results still improving rapidly.\nAt a high level, neural compression is based on an autoencoder that tries to reconstruct the input instance from a (quantized) latent representation, coupled with a prior that is used to losslessly compress these latents.\nDue to limitations on model capacity and imperfect optimization and generalization, such models will suboptimally compress test data in general.\nHowever, one of the great strengths of learned compression is that if the test-time data distribution is known and relatively low-entropy (e.g. a camera watching a static scene, a dash cam in an autonomous car, etc.), the model can easily be finetuned or adapted to this distribution, leading to improved RD performance.\nIn this paper we take this concept to the extreme, adapting the full model to a single video, and sending model updates (quantized and compressed using a parameter-space prior) along with the latent representation. Unlike previous work, we finetune not only the encoder/latents but the entire model, and - during finetuning - take into account both the effect of model quantization and the additional costs incurred by sending the model updates. 
We evaluate an image compression model on I-frames (sampled at 2 fps) from videos of the Xiph dataset, and demonstrate that full-model adaptation improves RD performance by ~1 dB, with respect to encoder-only finetuning.", + "title": "Overfitting for Fun and Profit: Instance-Adaptive Data Compression", + "authors": [ + "Ties van Rozendaal", + "Iris AM Huijben", + "Taco Cohen" + ], + "emails": [ + "~Ties_van_Rozendaal1", + "qti.qualcomm.com", + "~Taco_Cohen1" + ], + "rank": 461 + }, + { + "url": "https://openreview.net/forum?id=xCxXwTzx4L1", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 6 + ], + "rating": "6.50", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Structured pruning methods are among the effective strategies for extracting small resource-efficient convolutional neural networks from their dense counterparts with minimal loss in accuracy. However, most existing methods still suffer from one or more limitations, that include 1) the need for training the dense model from scratch with pruning-related parameters embedded in the architecture, 2) requiring model-specific hyperparameter settings, 3) inability to include budget-related constraint in the training process, and 4) instability under scenarios of extreme pruning. In this paper, we present ChipNet, a deterministic pruning strategy that employs continuous Heaviside function and a novel crispness loss to identify a highly sparse network out of an existing dense network. Our choice of continuous Heaviside function is inspired by the field of design optimization, where the material distribution task is posed as a continuous optimization problem, but only discrete values (0 or 1) are practically feasible and expected as final outcomes. Our approach's flexible design facilitates its use with different choices of budget constraints while maintaining stability for very low target budgets. 
Experimental results show that ChipNet outperforms state-of-the-art structured pruning methods by remarkable margins of up to 16.1% in terms of accuracy. Further, we show that the masks obtained with ChipNet are transferable across datasets. For certain cases, it was observed that masks transferred from a model trained on feature-rich teacher dataset provide better performance on the student dataset than those obtained by directly pruning on the student data itself.", + "title": "ChipNet: Budget-Aware Pruning with Heaviside Continuous Approximations", + "authors": [ + "Rishabh Tiwari", + "Udbhav Bamba", + "Arnav Chavan", + "Deepak Gupta" + ], + "emails": [ + "~Deepak_Gupta2", + "~Arnav_Chavan1", + "~Udbhav_Bamba1", + "~Rishabh_Tiwari1" + ], + "rank": 462 + }, + { + "url": "https://openreview.net/forum?id=dyjPVUc2KB", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 7 + ], + "rating": "6.50", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "In this paper we consider reinforcement learning tasks with progressive rewards; that is, tasks where the rewards tend to increase in magnitude over time. We hypothesise that this property may be problematic for value-based deep reinforcement learning agents, particularly if the agent must first succeed in relatively unrewarding regions of the task in order to reach more rewarding regions. To address this issue, we propose Spectral DQN, which decomposes the reward into frequencies such that the high frequencies only activate when large rewards are found. This allows the training loss to be balanced so that it gives more even weighting across small and large reward regions. In two domains with extreme reward progressivity, where standard value-based methods struggle significantly, Spectral DQN is able to make much farther progress. 
Moreover, when evaluated on a set of six standard Atari games that do not overtly favour the approach, Spectral DQN remains more than competitive: While it underperforms one of the benchmarks in a single game, it comfortably surpasses the benchmarks in three games. These results demonstrate that the approach is not overfit to its target problem, and suggest that Spectral DQN may have advantages beyond addressing reward progressivity.", + "title": "Adapting to Reward Progressivity via Spectral Reinforcement Learning", + "authors": [ + "Michael Dann", + "John Thangarajah" + ], + "emails": [ + "~Michael_Dann1", + "rmit.edu.au" + ], + "rank": 463 + }, + { + "url": "https://openreview.net/forum?id=vYeQQ29Tbvx", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 6, + 6, + 6 + ], + "rating": "6.50", + "confidences": [ + 4, + 4, + 5, + 3 + ], + "abstract": "A wide variety of deep learning techniques from style transfer to multitask learning rely on training affine transformations of features. Most prominent among these is the popular feature normalization technique BatchNorm, which normalizes activations and then subsequently applies a learned affine transform. In this paper, we aim to understand the role and expressive power of affine parameters used to transform features in this way. To isolate the contribution of these parameters from that of the learned features they transform, we investigate the performance achieved when training only these parameters in BatchNorm and freezing all weights at their random initializations. Doing so leads to surprisingly high performance considering the significant limitations that this style of training imposes. For example, sufficiently deep ResNets reach 82% (CIFAR-10) and 32% (ImageNet, top-5) accuracy in this configuration, far higher than when training an equivalent number of randomly chosen parameters elsewhere in the network. 
BatchNorm achieves this performance in part by naturally learning to disable around a third of the random features. Not only do these results highlight the expressive power of affine parameters in deep learning, but - in a broader sense - they characterize the expressive power of neural networks constructed simply by shifting and rescaling random features.", + "title": "Training BatchNorm and Only BatchNorm: On the Expressive Power of Random Features in CNNs", + "authors": [ + "Jonathan Frankle", + "David J. Schwab", + "Ari S. Morcos" + ], + "emails": [ + "~David_J._Schwab1", + "~Jonathan_Frankle1", + "~Ari_S._Morcos1" + ], + "rank": 464 + }, + { + "url": "https://openreview.net/forum?id=MmCRswl1UYl", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 6 + ], + "rating": "6.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "In open question answering (QA), the answer to a question is produced by retrieving and then analyzing documents that might contain answers to the question. Most open QA systems have considered only retrieving information from unstructured text. Here we consider for the first time open QA over {\\em both} tabular and textual data and present a new large-scale dataset \\emph{Open Table-and-Text Question Answering} (OTT-QA) to evaluate performance on this task. Most questions in OTT-QA require multi-hop inference across tabular data and unstructured text, and the evidence required to answer a question can be distributed in different ways over these two types of input, making evidence retrieval challenging---our baseline model using an iterative retriever and BERT-based reader achieves an exact match score less than 10\\%. We then propose two novel techniques to address the challenge of retrieving and aggregating evidence for OTT-QA. The first technique is to use ``early fusion'' to group multiple highly relevant tabular and textual units into a fused block, which provides more context for the retriever to search for. 
The second technique is to use a cross-block reader to model the cross-dependency between multiple retrieved evidence with global-local sparse attention. Combining these two techniques improves the score significantly, to above 27\\%.", + "title": "Open Question Answering over Tables and Text", + "authors": [ + "Wenhu Chen", + "Ming-Wei Chang", + "Eva Schlinger", + "William Yang Wang", + "William W. Cohen" + ], + "emails": [ + "~Ming-Wei_Chang3", + "~William_W._Cohen2", + "~Wenhu_Chen3", + "~Eva_Schlinger2", + "~William_Yang_Wang2" + ], + "rank": 465 + }, + { + "url": "https://openreview.net/forum?id=zx_uX-BO7CH", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7, + 6 + ], + "rating": "6.50", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Continual learning methods with fixed architectures rely on a single network to learn models that can perform well on all tasks.\nAs a result, they often only accommodate common features of those tasks but neglect each task's specific features. On the other hand, dynamic architecture methods can have a separate network for each task, but they are too expensive to train and not scalable in practice, especially in online settings.\nTo address this problem, we propose a novel online continual learning method named ``Contextual Transformation Networks\u201d (CTN) to efficiently model the \\emph{task-specific features} while enjoying neglectable complexity overhead compared to other fixed architecture methods. \nMoreover, inspired by the Complementary Learning Systems (CLS) theory, we propose a novel dual memory design and an objective to train CTN that can address both catastrophic forgetting and knowledge transfer simultaneously. \nOur extensive experiments show that CTN is competitive with a large scale dynamic architecture network and consistently outperforms other fixed architecture methods under the same standard backbone. 
Our implementation can be found at \\url{https://github.com/phquang/Contextual-Transformation-Network}.", + "title": "Contextual Transformation Networks for Online Continual Learning", + "authors": [ + "Quang Pham", + "Chenghao Liu", + "Doyen Sahoo", + "Steven HOI" + ], + "emails": [ + "~Steven_HOI1", + "~Quang_Pham1", + "~Doyen_Sahoo1", + "~Chenghao_Liu1" + ], + "rank": 466 + }, + { + "url": "https://openreview.net/forum?id=3tFAs5E-Pe", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 7 + ], + "rating": "6.50", + "confidences": [ + 4, + 4, + 5, + 3 + ], + "abstract": "Wasserstein barycenters provide a geometric notion of the weighted average of probability measures based on optimal transport. In this paper, we present a scalable algorithm to compute Wasserstein-2 barycenters given sample access to the input measures, which are not restricted to being discrete. While past approaches rely on entropic or quadratic regularization, we employ input convex neural networks and cycle-consistency regularization to avoid introducing bias. As a result, our approach does not resort to minimax optimization. 
We provide theoretical analysis on error bounds as well as empirical evidence of the effectiveness of the proposed approach in low-dimensional qualitative scenarios and high-dimensional quantitative experiments.", + "title": "Continuous Wasserstein-2 Barycenter Estimation without Minimax Optimization", + "authors": [ + "Alexander Korotin", + "Lingxiao Li", + "Justin Solomon", + "Evgeny Burnaev" + ], + "emails": [ + "~Evgeny_Burnaev1", + "~Justin_Solomon1", + "mit.edu", + "~Alexander_Korotin2" + ], + "rank": 467 + }, + { + "url": "https://openreview.net/forum?id=5jRVa89sZk", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 5, + 6, + 7 + ], + "rating": "6.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "In many scenarios, named entity recognition (NER) models severely suffer from unlabeled entity problem, where the entities of a sentence may not be fully annotated. Through empirical studies performed on synthetic datasets, we find two causes of performance degradation. One is the reduction of annotated entities and the other is treating unlabeled entities as negative instances. The first cause has less impact than the second one and can be mitigated by adopting pretraining language models. The second cause seriously misguides a model in training and greatly affects its performances. Based on the above observations, we propose a general approach, which can almost eliminate the misguidance brought by unlabeled entities. The key idea is to use negative sampling that, to a large extent, avoids training NER models with unlabeled entities. Experiments on synthetic datasets and real-world datasets show that our model is robust to unlabeled entity problem and surpasses prior baselines. 
On well-annotated datasets, our model is competitive with the state-of-the-art method.", + "title": "Empirical Analysis of Unlabeled Entity Problem in Named Entity Recognition", + "authors": [ + "Yangming Li", + "lemao liu", + "Shuming Shi" + ], + "emails": [ + "~Yangming_Li1", + "~lemao_liu1", + "~Shuming_Shi1" + ], + "rank": 468 + }, + { + "url": "https://openreview.net/forum?id=Cri3xz59ga", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 6, + 7, + 6 + ], + "rating": "6.50", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "This article provides theoretical insights into the inner workings of multi-task and transfer learning methods, by studying the tractable least-square support vector machine multi-task learning (LS-SVM MTL) method, in the limit of large (p) and numerous (n) data. By a random matrix analysis applied to a Gaussian mixture data model, the performance of MTL LS-SVM is shown to converge, as n,p\u2192\u221e, to a deterministic limit involving simple (small-dimensional) statistics of the data.\n\nWe prove (i) that the standard MTL LS-SVM algorithm is in general strongly biased and may dramatically fail (to the point that individual single-task LS-SVMs may outperform the MTL approach, even for quite resembling tasks): our analysis provides a simple method to correct these biases, and that we reveal (ii) the sufficient statistics at play in the method, which can be efficiently estimated, even for quite small datasets. The latter result is exploited to automatically optimize the hyperparameters without resorting to any cross-validation procedure. 
\n\nExperiments on popular datasets demonstrate that our improved MTL LS-SVM method is computationally-efficient and outperforms sometimes much more elaborate state-of-the-art multi-task and transfer learning techniques.", + "title": "Deciphering and Optimizing Multi-Task Learning: a Random Matrix Approach", + "authors": [ + "Malik Tiomoko", + "Hafiz Tiomoko Ali", + "Romain Couillet" + ], + "emails": [ + "~Malik_Tiomoko1", + "~Hafiz_Tiomoko_Ali1", + "~Romain_Couillet1" + ], + "rank": 469 + }, + { + "url": "https://openreview.net/forum?id=3SqrRe8FWQ-", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 5 + ], + "rating": "6.50", + "confidences": [ + 4, + 5, + 3, + 4 + ], + "abstract": "Low-precision neural networks represent both weights and activations with few bits, drastically reducing the cost of multiplications. Meanwhile, these products are accumulated using high-precision (typically 32-bit) additions. Additions dominate the arithmetic complexity of inference in quantized (e.g., binary) nets, and high precision is needed to avoid overflow. To further optimize inference, we propose WrapNet, an architecture that adapts neural networks to use low-precision (8-bit) additions while achieving classification accuracy comparable to their 32-bit counterparts. We achieve resilience to low-precision accumulation by inserting a cyclic activation layer that makes results invariant to overflow. 
We demonstrate the efficacy of our approach using both software and hardware platforms.", + "title": "WrapNet: Neural Net Inference with Ultra-Low-Precision Arithmetic", + "authors": [ + "Renkun Ni", + "Hong-min Chu", + "Oscar Castaneda", + "Ping-yeh Chiang", + "Christoph Studer", + "Tom Goldstein" + ], + "emails": [ + "~Oscar_Castaneda1", + "~Christoph_Studer1", + "cs.umd.edu", + "~Ping-yeh_Chiang1", + "~Renkun_Ni1", + "~Tom_Goldstein1" + ], + "rank": 470 + }, + { + "url": "https://openreview.net/forum?id=kW_zpEmMLdP", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 6 + ], + "rating": "6.50", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "The existing Neural ODE formulation relies on an explicit knowledge of the termination time. We extend Neural ODEs to implicitly defined termination criteria modeled by neural event functions, which can be chained together and differentiated through. Neural Event ODEs are capable of modeling discrete and instantaneous changes in a continuous-time system, without prior knowledge of when these changes should occur or how many such changes should exist. We test our approach in modeling hybrid discrete- and continuous- systems such as switching dynamical systems and collision in multi-body systems, and we propose simulation-based training of point processes with applications in discrete control.", + "title": "Learning Neural Event Functions for Ordinary Differential Equations", + "authors": [ + "Ricky T. Q. 
Chen", + "Brandon Amos", + "Maximilian Nickel" + ], + "emails": [ + "~Ricky_T._Q._Chen1", + "~Brandon_Amos1", + "~Maximilian_Nickel1" + ], + "rank": 471 + }, + { + "url": "https://openreview.net/forum?id=IqtonxWI0V3", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 8, + 6 + ], + "rating": "6.50", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "Deep neural networks with rectified linear (ReLU) activations are piecewise linear functions, where hyperplanes partition the input space into an astronomically high number of linear regions. Previous work focused on counting linear regions to measure the network's expressive power and on analyzing geometric properties of the hyperplane configurations. In contrast, we aim to understand the impact of the linear terms on network performance, by examining the information encoded in their coefficients. To this end, we derive TropEx, a nontrivial tropical algebra-inspired algorithm to systematically extract linear terms based on data. Applied to convolutional and fully-connected networks, our algorithm uncovers significant differences in how the different networks utilize linear regions for generalization. This underlines the importance of systematic linear term exploration, to better understand generalization in neural networks trained with complex data sets.", + "title": "TropEx: An Algorithm for Extracting Linear Terms in Deep Neural Networks", + "authors": [ + "Martin Trimmel", + "Henning Petzka", + "Cristian Sminchisescu" + ], + "emails": [ + "~Cristian_Sminchisescu1", + "~Martin_Trimmel1", + "~Henning_Petzka1" + ], + "rank": 472 + }, + { + "url": "https://openreview.net/forum?id=qda7-sVg84", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 6, + 6 + ], + "rating": "6.50", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "Reinforcement learning methods trained on few environments rarely learn policies that generalize to unseen environments. 
To improve generalization, we incorporate the inherent sequential structure in reinforcement learning into the representation learning process. This approach is orthogonal to recent approaches, which rarely exploit this structure explicitly. Specifically, we introduce a theoretically motivated policy similarity metric (PSM) for measuring behavioral similarity between states. PSM assigns high similarity to states for which the optimal policies in those states as well as in future states are similar. We also present a contrastive representation learning procedure to embed any state similarity metric, which we instantiate with PSM to obtain policy similarity embeddings (PSEs). We demonstrate that PSEs improve generalization on diverse benchmarks, including LQR with spurious correlations, a jumping task from pixels, and Distracting DM Control Suite. ", + "title": "Contrastive Behavioral Similarity Embeddings for Generalization in Reinforcement Learning", + "authors": [ + "Rishabh Agarwal", + "Marlos C. Machado", + "Pablo Samuel Castro", + "Marc G Bellemare" + ], + "emails": [ + "~Pablo_Samuel_Castro1", + "~Marc_G_Bellemare1", + "~Marlos_C._Machado1", + "~Rishabh_Agarwal2" + ], + "rank": 473 + }, + { + "url": "https://openreview.net/forum?id=jP1vTH3inC", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 6 + ], + "rating": "6.50", + "confidences": [ + 3, + 5, + 2, + 4 + ], + "abstract": "The predominant approach for language modeling is to encode a sequence of tokens from left to right, but this eliminates a source of information: the order by which the sequence was naturally generated. One strategy to recover this information is to decode both the content and ordering of tokens. Some prior work supervises content and ordering with hand-designed loss functions to encourage specific orders or bootstraps from a predefined ordering. These approaches require domain-specific insight. 
Other prior work searches over valid insertion operations that lead to ground truth sequences during training, which has high time complexity and cannot be efficiently parallelized. We address these limitations with an unsupervised learner that can be trained in a fully-parallelizable manner to discover high-quality autoregressive orders in a data driven way without a domain-specific prior. The learner is a neural network that performs variational inference with the autoregressive ordering as a latent variable. Since the corresponding variational lower bound is not differentiable, we develop a practical algorithm for end-to-end optimization using policy gradients. Strong empirical results with our solution on sequence modeling tasks suggest that our algorithm is capable of discovering various autoregressive orders for different sequences that are competitive with or even better than fixed orders.", + "title": "Discovering Non-monotonic Autoregressive Orderings with Variational Inference", + "authors": [ + "Xuanlin Li", + "Brandon Trabucco", + "Dong Huk Park", + "Michael Luo", + "Sheng Shen", + "Trevor Darrell", + "Yang Gao" + ], + "emails": [ + "~Brandon_Trabucco1", + "~Dong_Huk_Park2", + "~Michael_Luo2", + "~Xuanlin_Li1", + "~Trevor_Darrell2", + "~Yang_Gao1", + "~Sheng_Shen2" + ], + "rank": 474 + }, + { + "url": "https://openreview.net/forum?id=UwGY2qjqoLD", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 8, + 6 + ], + "rating": "6.50", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "In the present work we study classifiers' decision boundaries via Brownian motion processes in ambient data space and associated probabilistic techniques. Intuitively, our ideas correspond to placing a heat source at the decision boundary and observing how effectively the sample points warm up. We are largely motivated by the search for a soft measure that sheds further light on the decision boundary's geometry. 
En route, we bridge aspects of potential theory and geometric analysis (Maz'ya 2011, Grigor'Yan and Saloff-Coste 2002) with active fields of ML research such as adversarial examples and generalization bounds. First, we focus on the geometric behavior of decision boundaries in the light of adversarial attack/defense mechanisms. Experimentally, we observe a certain capacitory trend over different adversarial defense strategies: decision boundaries locally become flatter as measured by isoperimetric inequalities (Ford et al 2019); however, our more sensitive heat-diffusion metrics extend this analysis and further reveal that some non-trivial geometry invisible to plain distance-based methods is still preserved. Intuitively, we provide evidence that the decision boundaries nevertheless retain many persistent \"wiggly and fuzzy\" regions on a finer scale.\nSecond, we show how Brownian hitting probabilities translate to soft generalization bounds which are in turn connected to compression and noise stability (Arora et al 2018), and these bounds are significantly stronger if the decision boundary has controlled geometric features.", + "title": "Heating up decision boundaries: isocapacitory saturation, adversarial scenarios and generalization bounds", + "authors": [ + "Bogdan Georgiev", + "Lukas Franken", + "Mayukh Mukherjee" + ], + "emails": [ + "~Mayukh_Mukherjee1", + "~Bogdan_Georgiev1", + "~Lukas_Franken1" + ], + "rank": 475 + }, + { + "url": "https://openreview.net/forum?id=CZ8Y3NzuVzO", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 8, + 6, + 7 + ], + "rating": "6.50", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Recent self-supervised contrastive methods have been able to produce impressive transferable visual representations by learning to be invariant to different data augmentations. 
However, these methods implicitly assume a particular set of representational invariances (e.g., invariance to color), and can perform poorly when a downstream task violates this assumption (e.g., distinguishing red vs. yellow cars). We introduce a contrastive learning framework which does not require prior knowledge of specific, task-dependent invariances. Our model learns to capture varying and invariant factors for visual representations by constructing separate embedding spaces, each of which is invariant to all but one augmentation. We use a multi-head network with a shared backbone which captures information across each augmentation and alone outperforms all baselines on downstream tasks. We further find that the concatenation of the invariant and varying spaces performs best across all tasks we investigate, including coarse-grained, fine-grained, and few-shot downstream classification tasks, and various data corruptions.", + "title": "What Should Not Be Contrastive in Contrastive Learning", + "authors": [ + "Tete Xiao", + "Xiaolong Wang", + "Alexei A Efros", + "Trevor Darrell" + ], + "emails": [ + "~Xiaolong_Wang3", + "~Alexei_A_Efros1", + "~Trevor_Darrell2", + "~Tete_Xiao1" + ], + "rank": 476 + }, + { + "url": "https://openreview.net/forum?id=NSBrFgJAHg", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6 + ], + "rating": "6.50", + "confidences": [ + 2, + 4, + 2 + ], + "abstract": "Graph neural networks (GNNs) have demonstrated strong performance on a wide variety of tasks due to their ability to model non-uniform structured data. Despite their promise, there exists little research exploring methods to make them more efficient at inference time. In this work, we explore the viability of training quantized GNNs, enabling the usage of low precision integer arithmetic during inference. For GNNs seemingly unimportant choices in quantization implementation cause dramatic changes in performance. 
We identify the sources of error that uniquely arise when attempting to quantize GNNs, and propose an architecturally-agnostic and stable method, Degree-Quant, to improve performance over existing quantization-aware training baselines commonly used on other architectures, such as CNNs. We validate our method on six datasets and show, unlike previous quantization attempts, that models generalize to unseen graphs. Models trained with Degree-Quant for INT8 quantization perform as well as FP32 models in most cases; for INT4 models, we obtain up to 26% gains over the baselines. Our work enables up to 4.7x speedups on CPU when using INT8 arithmetic.", + "title": "Degree-Quant: Quantization-Aware Training for Graph Neural Networks", + "authors": [ + "Shyam Anil Tailor", + "Javier Fernandez-Marques", + "Nicholas Donald Lane" + ], + "emails": [ + "~Javier_Fernandez-Marques1", + "~Nicholas_Donald_Lane1", + "~Shyam_Anil_Tailor1" + ], + "rank": 477 + }, + { + "url": "https://openreview.net/forum?id=_TM6rT7tXke", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 7 + ], + "rating": "6.50", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Recently, various auxiliary tasks have been proposed to accelerate representation learning and improve sample efficiency in deep reinforcement learning (RL). However, existing auxiliary tasks do not take the characteristics of RL problems into consideration and are unsupervised. By leveraging returns, the most important feedback signals in RL, we propose a novel auxiliary task that forces the learnt representations to discriminate state-action pairs with different returns. Our auxiliary loss is theoretically justified to learn representations that capture the structure of a new form of state-action abstraction, under which state-action pairs with similar return distributions are aggregated together. 
Empirically, our algorithm outperforms strong baselines on complex tasks in Atari games and DeepMind Control suite, and achieves even better performance when combined with existing auxiliary tasks.", + "title": "Return-Based Contrastive Representation Learning for Reinforcement Learning", + "authors": [ + "Guoqing Liu", + "Chuheng Zhang", + "Li Zhao", + "Tao Qin", + "Jinhua Zhu", + "Li Jian", + "Nenghai Yu", + "Tie-Yan Liu" + ], + "emails": [ + "~Tao_Qin1", + "~Tie-Yan_Liu1", + "~Guoqing_Liu3", + "~Chuheng_Zhang1", + "~Li_Jian1", + "~Jinhua_Zhu1", + "~Nenghai_Yu1", + "~Li_Zhao1" + ], + "rank": 478 + }, + { + "url": "https://openreview.net/forum?id=te7PVH1sPxJ", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 5, + 7, + 6 + ], + "rating": "6.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Flow-based models are powerful tools for designing probabilistic models with tractable density. This paper introduces Convex Potential Flows (CP-Flow), a natural and efficient parameterization of invertible models inspired by the optimal transport (OT) theory. CP-Flows are the gradient map of a strongly convex neural potential function. The convexity implies invertibility and allows us to resort to convex optimization to solve the convex conjugate for efficient inversion. To enable maximum likelihood training, we derive a new gradient estimator of the log-determinant of the Jacobian, which involves solving an inverse-Hessian vector product using the conjugate gradient method. The gradient estimator has constant-memory cost, and can be made effectively unbiased by reducing the error tolerance level of the convex optimization routine. Theoretically, we prove that CP-Flows are universal density approximators and are optimal in the OT sense. 
Our empirical results show that CP-Flow performs competitively on standard benchmarks of density estimation and variational inference.", + "title": "Convex Potential Flows: Universal Probability Distributions with Optimal Transport and Convex Optimization", + "authors": [ + "Chin-Wei Huang", + "Ricky T. Q. Chen", + "Christos Tsirigotis", + "Aaron Courville" + ], + "emails": [ + "~Chin-Wei_Huang1", + "~Aaron_Courville3", + "~Christos_Tsirigotis1", + "~Ricky_T._Q._Chen1" + ], + "rank": 479 + }, + { + "url": "https://openreview.net/forum?id=uxpzitPEooJ", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 6 + ], + "rating": "6.50", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "As large scale-graphs become increasingly more prevalent, it poses significant computational challenges to process, extract and analyze large graph data. Graph coarsening is one popular technique to reduce the size of a graph while maintaining essential properties. Despite rich graph coarsening literature, there is only limited exploration of data-driven method in the field. In this work, we leverage the recent progress of deep learning on graphs for graph coarsening. We first propose a framework for measuring the quality of coarsening algorithm and show that depending on the goal, we need to carefully choose the Laplace operator on the coarse graph and associated projection/lift operators. Motivated by the observation that the current choice of edge weight for the coarse graph may be sub-optimal, we parametrize the weight assignment map with graph neural networks and train it to improve the coarsening quality in an unsupervised way. Through extensive experiments on both synthetic and real networks, we demonstrate that our method significantly improves common graph coarsening methods under various metrics, reduction ratios, graph sizes, and graph types. 
It generalizes to graphs of larger size (more than 25\u00d7 of training graphs), adaptive to different losses (both differentiable and non-differentiable), and scales to much larger graphs than previous work.", + "title": "Graph Coarsening with Neural Networks", + "authors": [ + "Chen Cai", + "Dingkang Wang", + "Yusu Wang" + ], + "emails": [ + "osu.edu", + "~Yusu_Wang1", + "~Chen_Cai1" + ], + "rank": 480 + }, + { + "url": "https://openreview.net/forum?id=fgX9O5q0BT", + "decision": "Reject", + "ratings": [ + 7, + 7, + 6, + 6 + ], + "rating": "6.50", + "confidences": [ + 2, + 4, + 3, + 3 + ], + "abstract": "Noise injection is an effective way of circumventing overfitting and enhancing generalization in machine learning, the rationale of which has been validated in deep learning as well. Recently, noise injection exhibits surprising performance when\n generating high-fidelity images in Generative Adversarial Networks (GANs). Despite its successful applications in GANs, the mechanism of its validity is still unclear. In this paper, we propose a geometric framework to theoretically analyze the role of noise injection in GANs. Based on Riemannian geometry, we successfully model the noise injection framework as fuzzy equivalence on geodesic normal coordinates. Guided by our theories, we find that existing methods are incomplete and a new strategy for noise injection is devised. 
Experiments on image generation and GAN inversion demonstrate the superiority of our method.", + "title": "On Noise Injection in Generative Adversarial Networks", + "authors": [ + "Ruili Feng", + "Deli Zhao", + "Zheng-Jun Zha" + ], + "emails": [ + "~Deli_Zhao1", + "~Zheng-Jun_Zha2", + "~Ruili_Feng1" + ], + "rank": 481 + }, + { + "url": "https://openreview.net/forum?id=fAbkE6ant2", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 6 + ], + "rating": "6.50", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Due to the need to store the intermediate activations for back-propagation, end-to-end (E2E) training of deep networks usually suffers from high GPUs memory footprint. This paper aims to address this problem by revisiting the locally supervised learning, where a network is split into gradient-isolated modules and trained with local supervision. We experimentally show that simply training local modules with E2E loss tends to collapse task-relevant information at early layers, and hence hurts the performance of the full model. To avoid this issue, we propose an information propagation (InfoPro) loss, which encourages local modules to preserve as much useful information as possible, while progressively discard task-irrelevant information. As InfoPro loss is difficult to compute in its original form, we derive a feasible upper bound as a surrogate optimization objective, yielding a simple but effective algorithm. In fact, we show that the proposed method boils down to minimizing the combination of a reconstruction loss and a normal cross-entropy/contrastive term. Extensive empirical results on five datasets (i.e., CIFAR, SVHN, STL-10, ImageNet and Cityscapes) validate that InfoPro is capable of achieving competitive performance with less than 40% memory footprint compared to E2E training, while allowing using training data with higher-resolution or larger batch sizes under the same GPU memory constraint. 
Our method also enables training local modules asynchronously for potential training acceleration.", + "title": "Revisiting Locally Supervised Learning: an Alternative to End-to-end Training", + "authors": [ + "Yulin Wang", + "Zanlin Ni", + "Shiji Song", + "Le Yang", + "Gao Huang" + ], + "emails": [ + "~Zanlin_Ni1", + "~Le_Yang2", + "~Yulin_Wang1", + "~Gao_Huang1", + "~Shiji_Song1" + ], + "rank": 482 + }, + { + "url": "https://openreview.net/forum?id=kyaIeYj4zZ", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 5, + 7 + ], + "rating": "6.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "We present GraPPa, an effective pre-training approach for table semantic parsing that learns a compositional inductive bias in the joint representations of textual and tabular data. We construct synthetic question-SQL pairs over high-quality tables via a synchronous context-free grammar (SCFG). We pre-train our model on the synthetic data to inject important structural properties commonly found in semantic parsing into the pre-training language model. To maintain the model's ability to represent real-world data, we also include masked language modeling (MLM) on several existing table-related datasets to regularize our pre-training process. Our proposed pre-training strategy is much data-efficient. 
When incorporated with strong base semantic parsers, GraPPa achieves new state-of-the-art results on four popular fully supervised and weakly supervised table semantic parsing tasks.", + "title": "GraPPa: Grammar-Augmented Pre-Training for Table Semantic Parsing", + "authors": [ + "Tao Yu", + "Chien-Sheng Wu", + "Xi Victoria Lin", + "bailin wang", + "Yi Chern Tan", + "Xinyi Yang", + "Dragomir Radev", + "richard socher", + "Caiming Xiong" + ], + "emails": [ + "~Chien-Sheng_Wu1", + "~Xi_Victoria_Lin1", + "yale.edu", + "~Dragomir_Radev2", + "~richard_socher1", + "~Caiming_Xiong1", + "~Tao_Yu5", + "salesforce.com", + "~bailin_wang1" + ], + "rank": 483 + }, + { + "url": "https://openreview.net/forum?id=0-EYBhgw80y", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 7 + ], + "rating": "6.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "We propose a webly-supervised representation learning method that does not suffer from the annotation unscalability of supervised learning, nor the computation unscalability of self-supervised learning. Most existing works on webly-supervised representation learning adopt a vanilla supervised learning method without accounting for the prevalent noise in the training data, whereas most prior methods in learning with label noise are less effective for real-world large-scale noisy data. We propose momentum prototypes (MoPro), a simple contrastive learning method that achieves online label noise correction, out-of-distribution sample removal, and representation learning. MoPro achieves state-of-the-art performance on WebVision, a weakly-labeled noisy dataset. MoPro also shows superior performance when the pretrained model is transferred to down-stream image classification and detection tasks. It outperforms the ImageNet supervised pretrained model by +10.5 on 1-shot classification on VOC, and outperforms the best self-supervised pretrained model by +17.3 when finetuned on 1% of ImageNet labeled samples. 
Furthermore, MoPro is more robust to distribution shifts. Code and pretrained models are available at https://github.com/salesforce/MoPro.", + "title": "MoPro: Webly Supervised Learning with Momentum Prototypes", + "authors": [ + "Junnan Li", + "Caiming Xiong", + "Steven Hoi" + ], + "emails": [ + "~Steven_Hoi2", + "~Caiming_Xiong1", + "~Junnan_Li2" + ], + "rank": 484 + }, + { + "url": "https://openreview.net/forum?id=ONBPHFZ7zG4", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 5, + 8, + 5, + 6 + ], + "rating": "6.48", + "confidences": [ + 4, + 4, + 5, + 4, + 4 + ], + "abstract": "Recent work on exploration in reinforcement learning (RL) has led to a series of increasingly complex solutions to the problem. This increase in complexity often comes at the expense of generality. Recent empirical studies suggest that, when applied to a broader set of domains, some sophisticated exploration methods are outperformed by simpler counterparts, such as \u03b5-greedy. In this paper we propose an exploration algorithm that retains the simplicity of \u03b5-greedy while reducing dithering. We build on a simple hypothesis: the main limitation of \u03b5-greedy exploration is its lack of temporal persistence, which limits its ability to escape local optima. We propose a temporally extended form of \u03b5-greedy that simply repeats the sampled action for a random duration. It turns out that, for many duration distributions, this suffices to improve exploration on a large set of domains. 
Interestingly, a class of distributions inspired by ecological models of animal foraging behaviour yields particularly strong performance.", + "title": "Temporally-Extended \u03b5-Greedy Exploration", + "authors": [ + "Will Dabney", + "Georg Ostrovski", + "Andre Barreto" + ], + "emails": [ + "~Andre_Barreto1", + "~Georg_Ostrovski1", + "~Will_Dabney1" + ], + "rank": 485 + }, + { + "url": "https://openreview.net/forum?id=8X2eaSZxTP", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 7 + ], + "rating": "6.47", + "confidences": [ + 3, + 5, + 3, + 4 + ], + "abstract": "We introduce PC2WF, the first end-to-end trainable deep network architecture to convert a 3D point cloud into a wireframe model. The network takes as input an unordered set of 3D points sampled from the surface of some object, and outputs a wireframe of that object, i.e., a sparse set of corner points linked by line segments. Recovering the wireframe is a challenging task, where the numbers of both vertices and edges are different for every instance, and a-priori unknown. Our architecture gradually builds up the model: It starts by encoding the points into feature vectors. Based on those features, it identifies a pool of candidate vertices, then prunes those candidates to a final set of corner vertices and refines their locations. Next, the corners are linked with an exhaustive set of candidate edges, which is again pruned to obtain the final wireframe. All steps are trainable, and errors can be backpropagated through the entire sequence. We validate the proposed model on a publicly available synthetic dataset, for which the ground truth wireframes are accessible, as well as on a new real-world dataset. 
Our model produces wireframe abstractions of good quality and outperforms several baselines.", + "title": "PC2WF: 3D Wireframe Reconstruction from Raw Point Clouds", + "authors": [ + "Yujia Liu", + "Stefano D'Aronco", + "Konrad Schindler", + "Jan Dirk Wegner" + ], + "emails": [ + "~Yujia_Liu3", + "~Stefano_D'Aronco1", + "~Jan_Dirk_Wegner1", + "~Konrad_Schindler1" + ], + "rank": 486 + }, + { + "url": "https://openreview.net/forum?id=Ti87Pv5Oc8", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 7, + 7 + ], + "rating": "6.47", + "confidences": [ + 4, + 4, + 2, + 5 + ], + "abstract": "Model Agnostic Meta-Learning (MAML) has emerged as a standard framework for meta-learning, where a meta-model is learned with the ability of fast adapting to new tasks. However, as a double-looped optimization problem, MAML needs to differentiate through the whole inner-loop optimization path for every outer-loop training step, which may lead to both computational inefficiency and sub-optimal solutions. In this paper, we generalize MAML to allow meta-learning to be defined in function spaces, and propose the first meta-learning paradigm in the Reproducing Kernel Hilbert Space (RKHS) induced by the meta-model's Neural Tangent Kernel (NTK). Within this paradigm, we introduce two meta-learning algorithms in the RKHS, which no longer need a sub-optimal iterative inner-loop adaptation as in the MAML framework. We achieve this goal by 1) replacing the adaptation with a fast-adaptive regularizer in the RKHS; and 2) solving the adaptation analytically based on the NTK theory. Extensive experimental studies demonstrate advantages of our paradigm in both efficiency and quality of solutions compared to related meta-learning algorithms. 
Another interesting feature of our proposed methods is that they are demonstrated to be more robust to adversarial attacks and out-of-distribution adaptation than popular baselines, as demonstrated in our experiments.", + "title": "Meta-Learning with Neural Tangent Kernels", + "authors": [ + "Yufan Zhou", + "Zhenyi Wang", + "Jiayi Xian", + "Changyou Chen", + "Jinhui Xu" + ], + "emails": [ + "~Zhenyi_Wang1", + "~Jinhui_Xu1", + "~Changyou_Chen1", + "~Yufan_Zhou1", + "buffalo.edu" + ], + "rank": 487 + }, + { + "url": "https://openreview.net/forum?id=CR1XOQ0UTh-", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 7 + ], + "rating": "6.47", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "We consider the question: how can you sample good negative examples for contrastive learning? We argue that, as with metric learning, learning contrastive representations benefits from hard negative samples (i.e., points that are difficult to distinguish from an anchor point). The key challenge toward using hard negatives is that contrastive methods must remain unsupervised, making it infeasible to adopt existing negative sampling strategies that use label information. In response, we develop a new class of unsupervised methods for selecting hard negative samples where the user can control the amount of hardness. A limiting case of this sampling results in a representation that tightly clusters each class, and pushes different classes as far apart as possible. 
The proposed method improves downstream performance across multiple modalities, requires only few additional lines of code to implement, and introduces no computational overhead.\n", + "title": "Contrastive Learning with Hard Negative Samples", + "authors": [ + "Joshua David Robinson", + "Ching-Yao Chuang", + "Suvrit Sra", + "Stefanie Jegelka" + ], + "emails": [ + "~Suvrit_Sra1", + "~Stefanie_Jegelka3", + "~Ching-Yao_Chuang1", + "~Joshua_David_Robinson1" + ], + "rank": 488 + }, + { + "url": "https://openreview.net/forum?id=qzBUIzq5XR2", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 7 + ], + "rating": "6.47", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "People can learn rich, general-purpose conceptual representations from only raw perceptual inputs. Current machine learning approaches fall well short of these human standards, although different modeling traditions often have complementary strengths. Symbolic models can capture the compositional and causal knowledge that enables flexible generalization, but they struggle to learn from raw inputs, relying on strong abstractions and simplifying assumptions. Neural network models can learn directly from raw data, but they struggle to capture compositional and causal structure and typically must retrain to tackle new tasks. We bring together these two traditions to learn generative models of concepts that capture rich compositional and causal structure, while learning from raw data. We develop a generative neuro-symbolic (GNS) model of handwritten character concepts that uses the control flow of a probabilistic program, coupled with symbolic stroke primitives and a symbolic image renderer, to represent the causal and compositional processes by which characters are formed. The distributions of parts (strokes), and correlations between parts, are modeled with neural network subroutines, allowing the model to learn directly from raw data and express nonparametric statistical relationships. 
We apply our model to the Omniglot challenge of human-level concept learning, using a background set of alphabets to learn an expressive prior distribution over character drawings. In a subsequent evaluation, our GNS model uses probabilistic inference to learn rich conceptual representations from a single training image that generalize to 4 unique tasks, succeeding where previous work has fallen short.", + "title": "Learning Task-General Representations with Generative Neuro-Symbolic Modeling", + "authors": [ + "Reuben Feinman", + "Brenden M. Lake" + ], + "emails": [ + "~Reuben_Feinman1", + "~Brenden_M._Lake1" + ], + "rank": 489 + }, + { + "url": "https://openreview.net/forum?id=N6JECD-PI5w", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 7 + ], + "rating": "6.47", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Pretrained text encoders, such as BERT, have been applied increasingly in various natural language processing (NLP) tasks, and have recently demonstrated significant performance gains. However, recent studies have demonstrated the existence of social bias in these pretrained NLP models. Although prior works have made progress on word-level debiasing, improved sentence-level fairness of pretrained encoders still lacks exploration. In this paper, we proposed the first neural debiasing method for a pretrained sentence encoder, which transforms the pretrained encoder outputs into debiased representations via a fair filter (FairFil) network. To learn the FairFil, we introduce a contrastive learning framework that not only minimizes the correlation between filtered embeddings and bias words but also preserves rich semantic information of the original sentences. On real-world datasets, our FairFil effectively reduces the bias degree of pretrained text encoders, while continuously showing desirable performance on downstream tasks. 
Moreover, our post hoc method does not require any retraining of the text encoders, further enlarging FairFil's application space.", + "title": "FairFil: Contrastive Neural Debiasing Method for Pretrained Text Encoders", + "authors": [ + "Pengyu Cheng", + "Weituo Hao", + "Siyang Yuan", + "Shijing Si", + "Lawrence Carin" + ], + "emails": [ + "~Pengyu_Cheng1", + "~Siyang_Yuan1", + "~Shijing_Si1", + "~Lawrence_Carin2", + "~Weituo_Hao1" + ], + "rank": 490 + }, + { + "url": "https://openreview.net/forum?id=xGZG2kS5bFk", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 7 + ], + "rating": "6.47", + "confidences": [ + 5, + 4, + 3, + 3 + ], + "abstract": "Dancing to music is one of human's innate abilities since ancient times. In machine learning research, however, synthesizing dance movements from music is a challenging problem. Recently, researchers synthesize human motion sequences through autoregressive models like recurrent neural network (RNN). Such an approach often generates short sequences due to an accumulation of prediction errors that are fed back into the neural network. This problem becomes even more severe in the long motion sequence generation. Besides, the consistency between dance and music in terms of style, rhythm and beat is yet to be taken into account during modeling. In this paper, we formalize the music-driven dance generation as a sequence-to-sequence learning problem and devise a novel seq2seq architecture to efficiently process long sequences of music features and capture the fine-grained correspondence between music and dance. Furthermore, we propose a novel curriculum learning strategy to alleviate error accumulation of autoregressive models in long motion sequence generation, which gently changes the training process from a fully guided teacher-forcing scheme using the previous ground-truth movements, towards a less guided autoregressive scheme mostly using the generated movements instead. 
Extensive experiments show that our approach significantly outperforms the existing state-of-the-arts on automatic metrics and human evaluation. We also make a demo video to demonstrate the superior performance of our proposed approach at https://www.youtube.com/watch?v=lmE20MEheZ8.", + "title": " Dance Revolution: Long-Term Dance Generation with Music via Curriculum Learning", + "authors": [ + "Ruozi Huang", + "Huang Hu", + "Wei Wu", + "Kei Sawada", + "Mi Zhang", + "Daxin Jiang" + ], + "emails": [ + "~Huang_Hu1", + "microsoft.com", + "fudan.edu.cn", + "~Ruozi_Huang1", + "~Wei_Wu1" + ], + "rank": 491 + }, + { + "url": "https://openreview.net/forum?id=8e6BrwU6AjQ", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 6 + ], + "rating": "6.47", + "confidences": [ + 3, + 4, + 3, + 5 + ], + "abstract": "This paper focuses on visual counting, which aims to predict the number of occurrences given a natural image and a query (e.g. a question or a category). Unlike most prior works that use explicit, symbolic models which can be computationally expensive and limited in generalization, we propose a simple and effective alternative by revisiting modulated convolutions that fuse the query and the image locally. Following the design of residual bottleneck, we call our method MoVie, short for Modulated conVolutional bottlenecks. Notably, MoVie reasons implicitly and holistically and only needs a single forward-pass during inference. Nevertheless, MoVie showcases strong performance for counting: 1) advancing the state-of-the-art on counting-specific VQA tasks while being more efficient; 2) outperforming prior-art on difficult benchmarks like COCO for common object counting; 3) helped us secure the first place of 2020 VQA challenge when integrated as a module for \u2018number\u2019 related questions in generic VQA models. 
Finally, we show evidence that modulated convolutions such as MoVie can serve as a general mechanism for reasoning tasks beyond counting.", + "title": "MoVie: Revisiting Modulated Convolutions for Visual Counting and Beyond", + "authors": [ + "Duy Kien Nguyen", + "Vedanuj Goswami", + "Xinlei Chen" + ], + "emails": [ + "~Xinlei_Chen1", + "~Duy_Kien_Nguyen1", + "~Vedanuj_Goswami1" + ], + "rank": 492 + }, + { + "url": "https://openreview.net/forum?id=agHLCOBM5jP", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 6 + ], + "rating": "6.47", + "confidences": [ + 5, + 4, + 3, + 3 + ], + "abstract": "Deep Learning based methods have emerged as the indisputable leaders for virtually all image restoration tasks. Especially in the domain of microscopy images, various content-aware image restoration (CARE) approaches are now used to improve the interpretability of acquired data. Naturally, there are limitations to what can be restored in corrupted images, and like for all inverse problems, many potential solutions exist, and one of them must be chosen. Here, we propose DivNoising, a denoising approach based on fully convolutional variational autoencoders (VAEs), overcoming the problem of having to choose a single solution by predicting a whole distribution of denoised images. First we introduce a principled way of formulating the unsupervised denoising problem within the VAE framework by explicitly incorporating imaging noise models into the decoder. Our approach is fully unsupervised, only requiring noisy images and a suitable description of the imaging noise distribution. We show that such a noise model can either be measured, bootstrapped from noisy data, or co-learned during training. If desired, consensus predictions can be inferred from a set of DivNoising predictions, leading to competitive results with other unsupervised methods and, on occasion, even with the supervised state-of-the-art. 
DivNoising samples from the posterior enable a plethora of useful applications. We are (i) showing denoising results for 13 datasets, (ii) discussing how optical character recognition (OCR) applications can benefit from diverse predictions, and are (iii) demonstrating how instance cell segmentation improves when using diverse DivNoising predictions.", + "title": "Fully Unsupervised Diversity Denoising with Convolutional Variational Autoencoders", + "authors": [ + "Mangal Prakash", + "Alexander Krull", + "Florian Jug" + ], + "emails": [ + "~Alexander_Krull3", + "~Florian_Jug1", + "~Mangal_Prakash1" + ], + "rank": 493 + }, + { + "url": "https://openreview.net/forum?id=kWSeGEeHvF8", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 7 + ], + "rating": "6.47", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "Off-policy evaluation (OPE) holds the promise of being able to leverage large, offline datasets for both evaluating and selecting complex policies for decision making. The ability to learn offline is particularly important in many real-world domains, such as in healthcare, recommender systems, or robotics, where online data collection is an expensive and potentially dangerous process. Being able to accurately evaluate and select high-performing policies without requiring online interaction could yield significant benefits in safety, time, and cost for these applications. While many OPE methods have been proposed in recent years, comparing results between papers is difficult because currently there is a lack of a comprehensive and unified benchmark, and measuring algorithmic progress has been challenging due to the lack of difficult evaluation tasks. In order to address this gap, we present a collection of policies that in conjunction with existing offline datasets can be used for benchmarking off-policy evaluation. 
Our tasks include a range of challenging high-dimensional continuous control problems, with wide selections of datasets and policies for performing policy selection. The goal of our benchmark is to provide a standardized measure of progress that is motivated from a set of principles designed to challenge and test the limits of existing OPE methods. We perform an evaluation of state-of-the-art algorithms and provide open-source access to our data and code to foster future research in this area. ", + "title": "Benchmarks for Deep Off-Policy Evaluation", + "authors": [ + "Justin Fu", + "Mohammad Norouzi", + "Ofir Nachum", + "George Tucker", + "ziyu wang", + "Alexander Novikov", + "Mengjiao Yang", + "Michael R Zhang", + "Yutian Chen", + "Aviral Kumar", + "Cosmin Paduraru", + "Sergey Levine", + "Thomas Paine" + ], + "emails": [ + "~George_Tucker1", + "~Yutian_Chen1", + "~Ofir_Nachum1", + "~Aviral_Kumar2", + "~Mohammad_Norouzi1", + "~Thomas_Paine1", + "~Alexander_Novikov1", + "~Justin_Fu1", + "~Cosmin_Paduraru1", + "~Michael_R_Zhang1", + "~Sergey_Levine1", + "~Mengjiao_Yang1", + "~ziyu_wang1" + ], + "rank": 494 + }, + { + "url": "https://openreview.net/forum?id=Xh5eMZVONGF", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 6 + ], + "rating": "6.47", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "Source code (Context) and its parsed abstract syntax tree (AST; Structure) are two complementary representations of the same computer program. Traditionally, designers of machine learning models have relied predominantly either on Structure or Context. We propose a new model, which jointly learns on Context and Structure of source code. In contrast to previous approaches, our model uses only language-agnostic features, i.e., source code and features that can be computed directly from the AST. 
Besides obtaining state-of-the-art on monolingual code summarization on all five programming languages considered in this work, we propose the first multilingual code summarization model. We show that jointly training on non-parallel data from multiple programming languages improves results on all individual languages, where the strongest gains are on low-resource languages. Remarkably, multilingual training only from Context does not lead to the same improvements, highlighting the benefits of combining Structure and Context for representation learning on code.", + "title": "Language-Agnostic Representation Learning of Source Code from Structure and Context", + "authors": [ + "Daniel Z\u00fcgner", + "Tobias Kirschstein", + "Michele Catasta", + "Jure Leskovec", + "Stephan G\u00fcnnemann" + ], + "emails": [ + "~Jure_Leskovec1", + "~Daniel_Z\u00fcgner1", + "tum.de", + "~Michele_Catasta1", + "~Stephan_G\u00fcnnemann1" + ], + "rank": 495 + }, + { + "url": "https://openreview.net/forum?id=0OlrLvrsHwQ", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 5, + 7 + ], + "rating": "6.47", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "In many domains data is currently represented as graphs and therefore, the graph representation of this data becomes increasingly important in machine learning. Network data is, implicitly or explicitly, always represented using a graph shift operator (GSO) with the most common choices being the adjacency, Laplacian matrices and their normalisations. In this paper, a novel parametrised GSO (PGSO) is proposed, where specific parameter values result in the most commonly used GSOs and message-passing operators in graph neural network (GNN) frameworks. The PGSO is suggested as a replacement of the standard GSOs that are used in state-of-the-art GNN architectures and the optimisation of the PGSO parameters is seamlessly included in the model training. 
It is proved that the PGSO has real eigenvalues and a set of real eigenvectors independent of the parameter values and spectral bounds on the PGSO are derived. PGSO parameters are shown to adapt to the sparsity of the graph structure in a study on stochastic blockmodel networks, where they are found to automatically replicate the GSO regularisation found in the literature. On several real-world datasets the accuracy of state-of-the-art GNN architectures is improved by the inclusion of the PGSO in both node- and graph-classification tasks. ", + "title": "Learning Parametrised Graph Shift Operators", + "authors": [ + "George Dasoulas", + "Johannes F. Lutzeyer", + "Michalis Vazirgiannis" + ], + "emails": [ + "~Michalis_Vazirgiannis1", + "~Johannes_F._Lutzeyer1", + "~George_Dasoulas1" + ], + "rank": 496 + }, + { + "url": "https://openreview.net/forum?id=eo6U4CAwVmg", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 6 + ], + "rating": "6.47", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Recent works in Generative Adversarial Networks (GANs) are actively revisiting various data augmentation techniques as an effective way to prevent discriminator overfitting. It is still unclear, however, that which augmentations could actually improve GANs, and in particular, how to apply a wider range of augmentations in training. In this paper, we propose a novel way to address these questions by incorporating a recent contrastive representation learning scheme into the GAN discriminator, coined ContraD. This \"fusion\" enables the discriminators to work with much stronger augmentations without increasing their training instability, thereby preventing the discriminator overfitting issue in GANs more effectively. 
Even better, we observe that the contrastive learning itself also benefits from our GAN training, i.e., by maintaining discriminative features between real and fake samples, suggesting a strong coherence between the two worlds: good contrastive representations are also good for GAN discriminators, and vice versa. Our experimental results show that GANs with ContraD consistently improve FID and IS compared to other recent techniques incorporating data augmentations, still maintaining highly discriminative features in the discriminator in terms of the linear evaluation. Finally, as a byproduct, we also show that our GANs trained in an unsupervised manner (without labels) can induce many conditional generative models via a simple latent sampling, leveraging the learned features of ContraD. Code is available at https://github.com/jh-jeong/ContraD.", + "title": "Training GANs with Stronger Augmentations via Contrastive Discriminator", + "authors": [ + "Jongheon Jeong", + "Jinwoo Shin" + ], + "emails": [ + "~Jongheon_Jeong1", + "~Jinwoo_Shin1" + ], + "rank": 497 + }, + { + "url": "https://openreview.net/forum?id=JiYq3eqTKY", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 7, + 4, + 7 + ], + "rating": "6.47", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Active learning is a powerful tool when labelling data is expensive, but it introduces a bias because the training data no longer follows the population distribution. We formalize this bias and investigate the situations in which it can be harmful and sometimes even helpful. We further introduce novel corrective weights to remove bias when doing so is beneficial. Through this, our work not only provides a useful mechanism that can improve the active learning approach, but also an explanation for the empirical successes of various existing approaches which ignore this bias. 
In particular, we show that this bias can be actively helpful when training overparameterized models---like neural networks---with relatively modest dataset sizes.", + "title": "On Statistical Bias In Active Learning: How and When to Fix It", + "authors": [ + "Sebastian Farquhar", + "Yarin Gal", + "Tom Rainforth" + ], + "emails": [ + "~Sebastian_Farquhar1", + "~Yarin_Gal1", + "~Tom_Rainforth1" + ], + "rank": 498 + }, + { + "url": "https://openreview.net/forum?id=xTJEN-ggl1b", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 6, + 6, + 6, + 6 + ], + "rating": "6.47", + "confidences": [ + 4, + 3, + 4, + 3, + 3 + ], + "abstract": "We present lambda layers -- an alternative framework to self-attention -- for capturing long-range interactions between an input and structured contextual information (e.g. a pixel surrounded by other pixels). Lambda layers capture such interactions by transforming available contexts into linear functions, termed lambdas, and applying these linear functions to each input separately. Similar to linear attention, lambda layers bypass expensive attention maps, but in contrast, they model both content and position-based interactions which enables their application to large structured inputs such as images. The resulting neural network architectures, LambdaNetworks, significantly outperform their convolutional and attentional counterparts on ImageNet classification, COCO object detection and instance segmentation, while being more computationally efficient. Additionally, we design LambdaResNets, a family of hybrid architectures across different scales, that considerably improves the speed-accuracy tradeoff of image classification models. LambdaResNets reach excellent accuracies on ImageNet while being 3.2 - 4.4x faster than the popular EfficientNets on modern machine learning accelerators. 
In large-scale semi-supervised training with an additional 130M pseudo-labeled images, LambdaResNets achieve up to 86.7% ImageNet accuracy while being 9.5x faster than EfficientNet NoisyStudent and 9x faster than a Vision Transformer with comparable accuracies.", + "title": "LambdaNetworks: Modeling long-range Interactions without Attention", + "authors": [ + "Irwan Bello" + ], + "emails": [ + "~Irwan_Bello1" + ], + "rank": 499 + }, + { + "url": "https://openreview.net/forum?id=ETBc_MIMgoX", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 7 + ], + "rating": "6.47", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "A key challenge for reinforcement learning (RL) consists of learning in environments with sparse extrinsic rewards. In contrast to current RL methods, humans are able to learn new skills with little or no reward by using various forms of intrinsic motivation. We propose AMIGo, a novel agent incorporating -- as form of meta-learning -- a goal-generating teacher that proposes Adversarially Motivated Intrinsic Goals to train a goal-conditioned \"student\" policy in the absence of (or alongside) environment reward. Specifically, through a simple but effective \"constructively adversarial\" objective, the teacher learns to propose increasingly challenging -- yet achievable -- goals that allow the student to learn general skills for acting in a new environment, independent of the task to be solved. We show that our method generates a natural curriculum of self-proposed goals which ultimately allows the agent to solve challenging procedurally-generated tasks where other forms of intrinsic motivation and state-of-the-art RL methods fail.", + "title": "Learning with AMIGo: Adversarially Motivated Intrinsic Goals", + "authors": [ + "Andres Campero", + "Roberta Raileanu", + "Heinrich Kuttler", + "Joshua B. 
Tenenbaum", + "Tim Rockt\u00e4schel", + "Edward Grefenstette" + ], + "emails": [ + "~Edward_Grefenstette1", + "~Andres_Campero1", + "~Tim_Rockt\u00e4schel1", + "~Roberta_Raileanu2", + "~Joshua_B._Tenenbaum1", + "~Heinrich_Kuttler1" + ], + "rank": 500 + }, + { + "url": "https://openreview.net/forum?id=hr-3PMvDpil", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 6 + ], + "rating": "6.46", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Adversarial patches pose a realistic threat model for physical world attacks on autonomous systems via their perception component. Autonomous systems in safety-critical domains such as automated driving should thus contain a fail-safe fallback component that combines certifiable robustness against patches with efficient inference while maintaining high performance on clean inputs. We propose BagCert, a novel combination of model architecture and certification procedure that allows efficient certification. We derive a loss that enables end-to-end optimization of certified robustness against patches of different sizes and locations. On CIFAR10, BagCert certifies 10.000 examples in 43 seconds on a single GPU and obtains 86% clean and 60% certified accuracy against 5x5 patches.", + "title": "Efficient Certified Defenses Against Patch Attacks on Image Classifiers", + "authors": [ + "Jan Hendrik Metzen", + "Maksym Yatsura" + ], + "emails": [ + "~Jan_Hendrik_Metzen1", + "~Maksym_Yatsura1" + ], + "rank": 501 + }, + { + "url": "https://openreview.net/forum?id=hu2aMLzOxC", + "decision": "Reject", + "ratings": [ + 6, + 7, + 7, + 6 + ], + "rating": "6.46", + "confidences": [ + 4, + 4, + 2, + 3 + ], + "abstract": "We train a single, goal-conditioned policy that can solve many robotic manipulation tasks, including tasks with previously unseen goals and objects. To do so, we rely on asymmetric self-play for goal discovery, where two agents, Alice and Bob, play a game. 
Alice is asked to propose challenging goals and Bob aims to solve them. We show that this method is able to discover highly diverse and complex goals without any human priors. We further show that Bob can be trained with only sparse rewards, because the interaction between Alice and Bob results in a natural curriculum and Bob can learn from Alice's trajectory when relabeled as a goal-conditioned demonstration. Finally, we show that our method scales, resulting in a single policy that can transfer to many unseen hold-out tasks such as setting a table, stacking blocks, and solving simple puzzles. Videos of a learned policy is available at https://robotics-self-play.github.io.", + "title": "Asymmetric self-play for automatic goal discovery in robotic manipulation", + "authors": [ + "OpenAI OpenAI", + "Matthias Plappert", + "Raul Sampedro", + "Tao Xu", + "Ilge Akkaya", + "Vineet Kosaraju", + "Peter Welinder", + "Ruben D'Sa", + "Arthur Petron", + "Henrique Ponde de Oliveira Pinto", + "Alex Paino", + "Hyeonwoo Noh", + "Lilian Weng", + "Qiming Yuan", + "Casey Chu", + "Wojciech Zaremba" + ], + "emails": [ + "~Casey_Chu1", + "~Matthias_Plappert1", + "~Wojciech_Zaremba1", + "~Hyeonwoo_Noh1", + "openai.com", + "~Vineet_Kosaraju1", + "~Lilian_Weng1" + ], + "rank": 502 + }, + { + "url": "https://openreview.net/forum?id=YwpZmcAehZ", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 7 + ], + "rating": "6.45", + "confidences": [ + 3, + 3, + 3, + 2 + ], + "abstract": "Recent research in dynamic convolution shows substantial performance boost for efficient CNNs, due to the adaptive aggregation of K static convolution kernels. It has two limitations: (a) it increases the number of convolutional weights by K-times, and (b) the joint optimization of dynamic attention and static convolution kernels is challenging. 
In this paper, we revisit it from a new perspective of matrix decomposition and reveal the key issue is that dynamic convolution applies dynamic attention over channel groups after projecting into a higher dimensional latent space. To address this issue, we propose dynamic channel fusion to replace dynamic attention over channel groups. Dynamic channel fusion not only enables significant dimension reduction of the latent space, but also mitigates the joint optimization difficulty. As a result, our method is easier to train and requires significantly fewer parameters without sacrificing accuracy. Source code is at https://github.com/liyunsheng13/dcd.", + "title": "Revisiting Dynamic Convolution via Matrix Decomposition", + "authors": [ + "Yunsheng Li", + "Yinpeng Chen", + "Xiyang Dai", + "mengchen liu", + "Dongdong Chen", + "Ye Yu", + "Lu Yuan", + "Zicheng Liu", + "Mei Chen", + "Nuno Vasconcelos" + ], + "emails": [ + "~Nuno_Vasconcelos1", + "~Mei_Chen2", + "~Lu_Yuan1", + "microsoft.com", + "~Yinpeng_Chen1", + "~Dongdong_Chen1", + "~Xiyang_Dai2", + "~Zicheng_Liu1", + "~Yunsheng_Li1" + ], + "rank": 503 + }, + { + "url": "https://openreview.net/forum?id=NsMLjcFaO8O", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 8, + 7, + 5 + ], + "rating": "6.44", + "confidences": [ + 5, + 4, + 3, + 4 + ], + "abstract": "This paper introduces WaveGrad, a conditional model for waveform generation which estimates gradients of the data density. The model is built on prior work on score matching and diffusion probabilistic models. 
It starts from a Gaussian white noise signal and iteratively refines the signal via a gradient-based sampler conditioned on the mel-spectrogram.\nWaveGrad offers a natural way to trade inference speed for sample quality by adjusting the number of refinement steps, and bridges the gap between non-autoregressive and autoregressive models in terms of audio quality.\nWe find that it can generate high fidelity audio samples using as few as six iterations.\nExperiments reveal WaveGrad to generate high fidelity audio, outperforming adversarial non-autoregressive baselines and matching a strong likelihood-based autoregressive baseline using fewer sequential operations. Audio samples are available at https://wavegrad.github.io/.", + "title": "WaveGrad: Estimating Gradients for Waveform Generation", + "authors": [ + "Nanxin Chen", + "Yu Zhang", + "Heiga Zen", + "Ron J Weiss", + "Mohammad Norouzi", + "William Chan" + ], + "emails": [ + "~Mohammad_Norouzi1", + "~Ron_J_Weiss1", + "~Heiga_Zen1", + "~Yu_Zhang2", + "~William_Chan1", + "~Nanxin_Chen1" + ], + "rank": 504 + }, + { + "url": "https://openreview.net/forum?id=SRDuJssQud", + "decision": "Accept (Spotlight)", + "ratings": [ + 6, + 6, + 7, + 7 + ], + "rating": "6.44", + "confidences": [ + 4, + 5, + 3, + 4 + ], + "abstract": "We consider the fundamental problem of how to automatically construct summary statistics for implicit generative models where the evaluation of the likelihood function is intractable but sampling data from the model is possible. The idea is to frame the task of constructing sufficient statistics as learning mutual information maximizing representations of the data with the help of deep neural networks. The infomax learning procedure does not need to estimate any density or density ratio. We apply our approach to both traditional approximate Bayesian computation and recent neural likelihood methods, boosting their performance on a range of tasks. 
", + "title": "Neural Approximate Sufficient Statistics for Implicit Models", + "authors": [ + "Yanzhi Chen", + "Dinghuai Zhang", + "Michael U. Gutmann", + "Aaron Courville", + "Zhanxing Zhu" + ], + "emails": [ + "~Zhanxing_Zhu1", + "~Aaron_Courville3", + "~Yanzhi_Chen1", + "~Michael_U._Gutmann1", + "~Dinghuai_Zhang1" + ], + "rank": 505 + }, + { + "url": "https://openreview.net/forum?id=P6_q1BRxY8Q", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 8, + 8, + 6, + 4 + ], + "rating": "6.44", + "confidences": [ + 3, + 2, + 4, + 3, + 4 + ], + "abstract": "We study the multi-agent safe control problem where agents should avoid collisions to static obstacles and collisions with each other while reaching their goals. Our core idea is to learn the multi-agent control policy jointly with learning the control barrier functions as safety certificates. We propose a new joint-learning framework that can be implemented in a decentralized fashion, which can adapt to an arbitrarily large number of agents. Building upon this framework, we further improve the scalability by incorporating neural network architectures that are invariant to the quantity and permutation of neighboring agents. In addition, we propose a new spontaneous policy refinement method to further enforce the certificate condition during testing. We provide extensive experiments to demonstrate that our method significantly outperforms other leading multi-agent control approaches in terms of maintaining safety and completing original tasks. Our approach also shows substantial generalization capability in that the control policy can be trained with 8 agents in one scenario, while being used on other scenarios with up to 1024 agents in complex multi-agent environments and dynamics. 
Videos and source code can be found at https://realm.mit.edu/blog/learning-safe-multi-agent-control-decentralized-neural-barrier-certificates.", + "title": "Learning Safe Multi-agent Control with Decentralized Neural Barrier Certificates", + "authors": [ + "Zengyi Qin", + "Kaiqing Zhang", + "Yuxiao Chen", + "Jingkai Chen", + "Chuchu Fan" + ], + "emails": [ + "~Zengyi_Qin1", + "~Jingkai_Chen2", + "~Chuchu_Fan2", + "~Yuxiao_Chen1", + "~Kaiqing_Zhang3" + ], + "rank": 506 + }, + { + "url": "https://openreview.net/forum?id=_IM-AfFhna9", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 8, + 4 + ], + "rating": "6.44", + "confidences": [ + 4, + 4, + 5, + 5 + ], + "abstract": "Continual learning deals with training models on new tasks and datasets in an online fashion. One strand of research has used probabilistic regularization for continual learning, with two of the main approaches in this vein being Online Elastic Weight Consolidation (Online EWC) and Variational Continual Learning (VCL). VCL employs variational inference, which in other settings has been improved empirically by applying likelihood-tempering. We show that applying this modification to VCL recovers Online EWC as a limiting case, allowing for interpolation between the two approaches. We term the general algorithm Generalized VCL (GVCL). In order to mitigate the observed overpruning effect of VI, we take inspiration from a common multi-task architecture, neural networks with task-specific FiLM layers, and find that this addition leads to significant performance gains, specifically for variational methods. In the small-data regime, GVCL strongly outperforms existing baselines. 
In larger datasets, GVCL with FiLM layers outperforms or is competitive with existing baselines in terms of accuracy, whilst also providing significantly better calibration.", + "title": "Generalized Variational Continual Learning", + "authors": [ + "Noel Loo", + "Siddharth Swaroop", + "Richard E Turner" + ], + "emails": [ + "~Noel_Loo1", + "~Siddharth_Swaroop2", + "~Richard_E_Turner1" + ], + "rank": 507 + }, + { + "url": "https://openreview.net/forum?id=LhY8QdUGSuw", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 7 + ], + "rating": "6.44", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "Catastrophic forgetting is a recurring challenge to developing versatile deep learning models. Despite its ubiquity, there is limited understanding of its connections to neural network (hidden) representations and task semantics. In this paper, we address this important knowledge gap. Through quantitative analysis of neural representations, we find that deeper layers are disproportionately responsible for forgetting, with sequential training resulting in an erasure of earlier task representational subspaces. Methods to mitigate forgetting stabilize these deeper layers, but show diversity on precise effects, with some increasing feature reuse while others store task representations orthogonally, preventing interference. 
These insights also enable the development of an analytic argument and empirical picture relating forgetting to task semantic similarity, where we find that maximal forgetting occurs for task sequences with intermediate similarity.", + "title": "Anatomy of Catastrophic Forgetting: Hidden Representations and Task Semantics", + "authors": [ + "Vinay Venkatesh Ramasesh", + "Ethan Dyer", + "Maithra Raghu" + ], + "emails": [ + "~Maithra_Raghu1", + "~Ethan_Dyer1", + "~Vinay_Venkatesh_Ramasesh1" + ], + "rank": 508 + }, + { + "url": "https://openreview.net/forum?id=H6ATjJ0TKdf", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 8, + 5, + 7 + ], + "rating": "6.44", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "Recent discoveries on neural network pruning reveal that, with a carefully chosen layerwise sparsity, a simple magnitude-based pruning achieves state-of-the-art tradeoff between sparsity and performance. However, without a clear consensus on ``how to choose,'' the layerwise sparsities are mostly selected algorithm-by-algorithm, often resorting to handcrafted heuristics or an extensive hyperparameter search. To fill this gap, we propose a novel importance score for global pruning, coined layer-adaptive magnitude-based pruning (LAMP) score; the score is a rescaled version of weight magnitude that incorporates the model-level \u21132 distortion incurred by pruning, and does not require any hyperparameter tuning or heavy computation.\nUnder various image classification setups, LAMP consistently outperforms popular existing schemes for layerwise sparsity selection.\nFurthermore, we observe that LAMP continues to outperform baselines even in weight-rewinding setups, while the connectivity-oriented layerwise sparsity (the strongest baseline overall) performs worse than a simple global magnitude-based pruning in this case. 
Code: https://github.com/jaeho-lee/layer-adaptive-sparsity", + "title": "Layer-adaptive Sparsity for the Magnitude-based Pruning", + "authors": [ + "Jaeho Lee", + "Sejun Park", + "Sangwoo Mo", + "Sungsoo Ahn", + "Jinwoo Shin" + ], + "emails": [ + "~Sangwoo_Mo1", + "~Jaeho_Lee3", + "~Sejun_Park1", + "~Jinwoo_Shin1", + "~Sungsoo_Ahn1" + ], + "rank": 509 + }, + { + "url": "https://openreview.net/forum?id=xHqKw3xJQhi", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 8 + ], + "rating": "6.44", + "confidences": [ + 4, + 5, + 5, + 4 + ], + "abstract": "Over-smoothing has emerged as a severe problem for node classification with graph convolutional networks (GCNs). In the view of message passing, the over-smoothing issue is caused by the observed noisy graph topology that would propagate information along inter-class edges, and consequently, over-mix the features of nodes in different classes. In this paper, we propose a novel architecture, namely VEM-GCN, to address this problem by employing the variational EM algorithm to jointly optimize the graph topology and learn desirable node representations for classification. Specifically, variational EM approaches a latent adjacency matrix parameterized by the assortative-constrained stochastic block model (SBM) to enhance intra-class connection and suppress inter-class interaction of the observed noisy graph. In the variational E-step, graph topology is optimized by approximating the posterior probability distribution of the latent adjacency matrix with a neural network learned from node embeddings. In the M-step, node representations are learned using the graph convolutional network based on the refined graph topology for the downstream task of classification. 
VEM-GCN is demonstrated to outperform existing strategies for tackling over-smoothing and optimizing graph topology in node classification on seven benchmark datasets.", + "title": "VEM-GCN: Topology Optimization with Variational EM for Graph Convolutional Networks", + "authors": [ + "Rui Yang", + "Wenrui Dai", + "Chenglin Li", + "Junni Zou", + "Hongkai Xiong" + ], + "emails": [ + "~Chenglin_Li2", + "~Hongkai_Xiong1", + "~Rui_Yang7", + "~Wenrui_Dai1", + "~Junni_Zou1" + ], + "rank": 510 + }, + { + "url": "https://openreview.net/forum?id=EKV158tSfwv", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 7 + ], + "rating": "6.43", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Existing literature in Continual Learning (CL) has focused on overcoming catastrophic forgetting, the inability of the learner to recall how to perform tasks observed in the past. \nThere are however other desirable properties of a CL system, such as the ability to transfer knowledge from previous tasks and to scale memory and compute sub-linearly with the number of tasks. Since most current benchmarks focus only on forgetting using short streams of tasks, we first propose a new suite of benchmarks to probe CL algorithms across these new axes. \nFinally, we introduce a new modular architecture, whose modules represent atomic skills that can be composed to perform a certain task. Learning a task reduces to figuring out which past modules to re-use, and which new modules to instantiate to solve the current task. Our learning algorithm leverages a task-driven prior over the exponential search space of all possible ways to combine modules, enabling efficient learning on long streams of tasks. \nOur experiments show that this modular architecture and learning algorithm perform competitively on widely used CL benchmarks while yielding superior performance on the more challenging benchmarks we introduce in this work. 
The Benchmark is publicly available at https://github.com/facebookresearch/CTrLBenchmark.", + "title": "Efficient Continual Learning with Modular Networks and Task-Driven Priors", + "authors": [ + "Tom Veniat", + "Ludovic Denoyer", + "MarcAurelio Ranzato" + ], + "emails": [ + "~Tom_Veniat1", + "~MarcAurelio_Ranzato1", + "~Ludovic_Denoyer1" + ], + "rank": 511 + }, + { + "url": "https://openreview.net/forum?id=I4c4K9vBNny", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 7 + ], + "rating": "6.43", + "confidences": [ + 5, + 3, + 3, + 3 + ], + "abstract": "How to improve generative modeling by better exploiting spatial regularities and coherence in images? We introduce a novel neural network for building image generators (decoders) and apply it to variational autoencoders (VAEs). In our spatial dependency networks (SDNs), feature maps at each level of a deep neural net are computed in a spatially coherent way, using a sequential gating-based mechanism that distributes contextual information across 2-D space. We show that augmenting the decoder of a hierarchical VAE by spatial dependency layers considerably improves density estimation over baseline convolutional architectures and the state-of-the-art among the models within the same class. Furthermore, we demonstrate that SDN can be applied to large images by synthesizing samples of high quality and coherence. In a vanilla VAE setting, we find that a powerful SDN decoder also improves learning disentangled representations, indicating that neural architectures play an important role in this task. Our results suggest favoring spatial dependency over convolutional layers in various VAE settings. The accompanying source code is given at https://github.com/djordjemila/sdn.", + "title": "Spatial Dependency Networks: Neural Layers for Improved Generative Image Modeling", + "authors": [ + "\u0110or\u0111e Miladinovi\u0107", + "Aleksandar Stani\u0107", + "Stefan Bauer", + "J\u00fcrgen Schmidhuber", + "Joachim M. 
Buhmann" + ], + "emails": [ + "~Stefan_Bauer1", + "~J\u00fcrgen_Schmidhuber1", + "~\u0110or\u0111e_Miladinovi\u01071", + "~Joachim_M._Buhmann1", + "~Aleksandar_Stani\u01071" + ], + "rank": 512 + }, + { + "url": "https://openreview.net/forum?id=1AoMhc_9jER", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 8 + ], + "rating": "6.43", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Deep generative adversarial networks (GANs) have gained growing popularity in numerous scenarios, while usually suffer from high parameter complexities for resource-constrained real-world applications. However, the compression of GANs has less been explored. A few works show that heuristically applying compression techniques normally leads to unsatisfactory results, due to the notorious training instability of GANs. In parallel, the lottery ticket hypothesis shows prevailing success on discriminative models, in locating sparse matching subnetworks capable of training in isolation to full model performance. In this work, we for the first time study the existence of such trainable matching subnetworks in deep GANs. For a range of GANs, we certainly find matching subnetworks at 67%-74% sparsity. We observe that with or without pruning discriminator has a minor effect on the existence and quality of matching subnetworks, while the initialization weights used in the discriminator plays a significant role. We then show the powerful transferability of these subnetworks to unseen tasks. Furthermore, extensive experimental results demonstrate that our found subnetworks substantially outperform previous state-of-the-art GAN compression approaches in both image generation (e.g. SNGAN) and image-to-image translation GANs (e.g. CycleGAN). 
Codes available at https://github.com/VITA-Group/GAN-LTH.", + "title": "GANs Can Play Lottery Tickets Too", + "authors": [ + "Xuxi Chen", + "Zhenyu Zhang", + "Yongduo Sui", + "Tianlong Chen" + ], + "emails": [ + "~Xuxi_Chen1", + "~Yongduo_Sui1", + "~Zhenyu_Zhang4", + "~Tianlong_Chen1" + ], + "rank": 513 + }, + { + "url": "https://openreview.net/forum?id=AhElGnhU2BV", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 8, + 4 + ], + "rating": "6.43", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "In this work, we examine the security of InstaHide, a scheme recently proposed by \\cite{hsla20} for preserving the security of private datasets in the context of distributed learning. To generate a synthetic training example to be shared among the distributed learners, InstaHide takes a convex combination of private feature vectors and randomly flips the sign of each entry of the resulting vector with probability 1/2. A salient question is whether this scheme is secure in any provable sense, perhaps under a plausible complexity-theoretic assumption. \n\nThe answer to this turns out to be quite subtle and closely related to the average-case complexity of a multi-task, missing-data version of the classic problem of phase retrieval that is interesting in its own right. 
Motivated by this connection, under the standard distributional assumption that the public/private feature vectors are isotropic Gaussian, we design an algorithm that can actually recover a private vector using only the public vectors and a sequence of synthetic vectors generated by InstaHide.", + "title": "On InstaHide, Phase Retrieval, and Sparse Matrix Factorization", + "authors": [ + "Sitan Chen", + "Xiaoxiao Li", + "Zhao Song", + "Danyang Zhuo" + ], + "emails": [ + "~Xiaoxiao_Li1", + "~Danyang_Zhuo1", + "~Sitan_Chen1", + "~Zhao_Song3" + ], + "rank": 514 + }, + { + "url": "https://openreview.net/forum?id=K5j7D81ABvt", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 6, + 4, + 7 + ], + "rating": "6.43", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "We propose the task of \\emph{disambiguating} symbolic expressions in informal STEM documents in the form of \\LaTeX files -- that is, determining their precise semantics and abstract syntax tree -- as a neural machine translation task. We discuss the distinct challenges involved and present a dataset with roughly 33,000 entries. We evaluated several baseline models on this dataset, which failed to yield even syntactically valid \\LaTeX before overfitting. Consequently, we describe a methodology using a \\emph{transformer} language model pre-trained on sources obtained from \\url{arxiv.org}, which yields promising results despite the small size of the dataset. 
We evaluate our model using a plurality of dedicated techniques, taking syntax and semantics of symbolic expressions into account.", + "title": "Disambiguating Symbolic Expressions in Informal Documents", + "authors": [ + "Dennis M\u00fcller", + "Cezary Kaliszyk" + ], + "emails": [ + "~Cezary_Kaliszyk1", + "~Dennis_M\u00fcller1" + ], + "rank": 515 + }, + { + "url": "https://openreview.net/forum?id=c8P9NQVtmnO", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 8, + 5 + ], + "rating": "6.43", + "confidences": [ + 3, + 2, + 4, + 5 + ], + "abstract": "The classical development of neural networks has primarily focused on learning mappings between finite-dimensional Euclidean spaces. Recently, this has been generalized to neural operators that learn mappings between function spaces. For partial differential equations (PDEs), neural operators directly learn the mapping from any functional parametric dependence to the solution. Thus, they learn an entire family of PDEs, in contrast to classical methods which solve one instance of the equation. In this work, we formulate a new neural operator by parameterizing the integral kernel directly in Fourier space, allowing for an expressive and efficient architecture. We perform experiments on Burgers' equation, Darcy flow, and Navier-Stokes equation. The Fourier neural operator is the first ML-based method to successfully model turbulent flows with zero-shot super-resolution. It is up to three orders of magnitude faster compared to traditional PDE solvers. 
Additionally, it achieves superior accuracy compared to previous learning-based solvers under fixed resolution.", + "title": "Fourier Neural Operator for Parametric Partial Differential Equations", + "authors": [ + "Zongyi Li", + "Nikola Borislavov Kovachki", + "Kamyar Azizzadenesheli", + "Burigede liu", + "Kaushik Bhattacharya", + "Andrew Stuart", + "Anima Anandkumar" + ], + "emails": [ + "~Andrew_Stuart2", + "~Kamyar_Azizzadenesheli1", + "~Nikola_Borislavov_Kovachki1", + "~Zongyi_Li1", + "~Kaushik_Bhattacharya1", + "~Anima_Anandkumar1", + "caltech.edu" + ], + "rank": 516 + }, + { + "url": "https://openreview.net/forum?id=BfayGoTV4iQ", + "decision": "Reject", + "ratings": [ + 9, + 4, + 6, + 6 + ], + "rating": "6.43", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "Sketch drawings are an intuitive visual domain that appeals to human instinct. Previous work has shown that recurrent neural networks are capable of producing sketch drawings of a single or few classes at a time. In this work we investigate representations developed by training a generative model to produce sketches from pixel images across many classes in a sketch domain. We find that the embeddings learned by this sketching model are extremely informative for visual tasks and infer a unique visual understanding. We then use them to exceed state-of-the-art performance in unsupervised few-shot classification on the Omniglot and mini-ImageNet benchmarks. We also leverage the generative capacity of our model to produce high quality sketches of novel classes based on just a single example. 
", + "title": "SketchEmbedNet: Learning Novel Concepts by Imitating Drawings", + "authors": [ + "Alexander Wang", + "Mengye Ren", + "Richard Zemel" + ], + "emails": [ + "~Mengye_Ren1", + "~Richard_Zemel1", + "~Alexander_Wang1" + ], + "rank": 517 + }, + { + "url": "https://openreview.net/forum?id=gwFTuzxJW0", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 5, + 9, + 7 + ], + "rating": "6.43", + "confidences": [ + 2, + 4, + 3, + 5 + ], + "abstract": "The vulnerability of deep networks to adversarial attacks is a central problem for deep learning from the perspective of both cognition and security. The current most successful defense method is to train a classifier using adversarial images created during learning. Another defense approach involves transformation or purification of the original input to remove adversarial signals before the image is classified. We focus on defending naturally-trained classifiers using Markov Chain Monte Carlo (MCMC) sampling with an Energy-Based Model (EBM) for adversarial purification. In contrast to adversarial training, our approach is intended to secure highly vulnerable pre-existing classifiers. To our knowledge, no prior defensive transformation is capable of securing naturally-trained classifiers, and our method is the first to validate a post-training defense approach that is distinct from current successful defenses which modify classifier training.\n\nThe memoryless behavior of long-run MCMC sampling will eventually remove adversarial signals, while metastable behavior preserves consistent appearance of MCMC samples after many steps to allow accurate long-run prediction. Balancing these factors can lead to effective purification and robust classification. We evaluate adversarial defense with an EBM using the strongest known attacks against purification. 
Our contributions are 1) an improved method for training EBM's with realistic long-run MCMC samples for effective purification, 2) an Expectation-Over-Transformation (EOT) defense that resolves ambiguities for evaluating stochastic defenses and from which the EOT attack naturally follows, and 3) state-of-the-art adversarial defense for naturally-trained classifiers and competitive defense compared to adversarial training on CIFAR-10, SVHN, and CIFAR-100. Our code and pre-trained models are available at https://github.com/point0bar1/ebm-defense.", + "title": "Stochastic Security: Adversarial Defense Using Long-Run Dynamics of Energy-Based Models", + "authors": [ + "Mitch Hill", + "Jonathan Craig Mitchell", + "Song-Chun Zhu" + ], + "emails": [ + "~Jonathan_Craig_Mitchell1", + "~Mitch_Hill1", + "~Song-Chun_Zhu1" + ], + "rank": 518 + }, + { + "url": "https://openreview.net/forum?id=KtH8W3S_RE", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6 + ], + "rating": "6.43", + "confidences": [ + 3, + 3, + 1 + ], + "abstract": "Detection of cancer-causing mutations within the vast and mostly unexplored human genome is a major challenge. Doing so requires modeling the background mutation rate, a highly non-stationary stochastic process, across regions of interest varying in size from one to millions of positions. Here, we present the split-Poisson-Gamma (SPG) distribution, an extension of the classical Poisson-Gamma formulation, to model a discrete stochastic process at multiple resolutions. We demonstrate that the probability model has a closed-form posterior, enabling efficient and accurate linear-time prediction over any length scale after the parameters of the model have been inferred a single time. We apply our framework to model mutation rates in tumors and show that model parameters can be accurately inferred from high-dimensional epigenetic data using a convolutional neural network, Gaussian process, and maximum-likelihood estimation. 
Our method is both more accurate and more efficient than existing models over a large range of length scales. We demonstrate the usefulness of multi-resolution modeling by detecting genomic elements that drive tumor emergence and are of vastly differing sizes.", + "title": "Multi-resolution modeling of a discrete stochastic process identifies causes of cancer", + "authors": [ + "Adam Uri Yaari", + "Maxwell Sherman", + "Oliver Clarke Priebe", + "Po-Ru Loh", + "Boris Katz", + "Andrei Barbu", + "Bonnie Berger" + ], + "emails": [ + "~Bonnie_Berger1", + "~Boris_Katz1", + "~Maxwell_Sherman1", + "~Po-Ru_Loh1", + "~Oliver_Clarke_Priebe1", + "~Adam_Uri_Yaari1", + "~Andrei_Barbu3" + ], + "rank": 519 + }, + { + "url": "https://openreview.net/forum?id=aYuZO9DIdnn", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 6 + ], + "rating": "6.42", + "confidences": [ + 5, + 3, + 2, + 2 + ], + "abstract": "A recent line of work showed that various forms of convolutional kernel methods can be competitive with standard supervised deep convolutional networks on datasets like CIFAR-10, obtaining accuracies in the range of 87-90% while being more amenable to theoretical analysis. In this work, we highlight the importance of a data-dependent feature extraction step that is key to the obtain good performance in convolutional kernel methods. This step typically corresponds to a whitened dictionary of patches, and gives rise to a data-driven convolutional kernel methods.We extensively study its effect, demonstrating it is the key ingredient for high performance of these methods. Specifically, we show that one of the simplest instances of such kernel methods, based on a single layer of image patches followed by a linear classifier is already obtaining classification accuracies on CIFAR-10 in the same range as previous more sophisticated convolutional kernel methods. 
We scale this method to the challenging ImageNet dataset, showing such a simple approach can exceed all existing non-learned representation methods. This is a new baseline for object recognition without representation learning methods, that initiates the investigation of convolutional kernel models on ImageNet. We conduct experiments to analyze the dictionary that we used, our ablations showing they exhibit low-dimensional properties.", + "title": "The Unreasonable Effectiveness of Patches in Deep Convolutional Kernels Methods", + "authors": [ + "Louis THIRY", + "Michael Arbel", + "Eugene Belilovsky", + "Edouard Oyallon" + ], + "emails": [ + "~Louis_THIRY1", + "~Eugene_Belilovsky1", + "~Edouard_Oyallon1", + "~Michael_Arbel1" + ], + "rank": 520 + }, + { + "url": "https://openreview.net/forum?id=LGgdb4TS4Z", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 8, + 5, + 6 + ], + "rating": "6.42", + "confidences": [ + 2, + 3, + 3, + 4 + ], + "abstract": "In the segmentation of fine-scale structures from natural and biomedical images, per-pixel accuracy is not the only metric of concern. Topological correctness, such as vessel connectivity and membrane closure, is crucial for downstream analysis tasks. In this paper, we propose a new approach to train deep image segmentation networks for better topological accuracy. In particular, leveraging the power of discrete Morse theory (DMT), we identify global structures, including 1D skeletons and 2D patches, which are important for topological accuracy. Trained with a novel loss based on these global structures, the network performance is significantly improved especially near topologically challenging locations (such as weak spots of connections and membranes). 
On diverse datasets, our method achieves superior performance on both the DICE score and topological metrics.", + "title": "Topology-Aware Segmentation Using Discrete Morse Theory", + "authors": [ + "Xiaoling Hu", + "Yusu Wang", + "Li Fuxin", + "Dimitris Samaras", + "Chao Chen" + ], + "emails": [ + "~Dimitris_Samaras3", + "~Yusu_Wang1", + "~Xiaoling_Hu1", + "~Chao_Chen1", + "~Li_Fuxin1" + ], + "rank": 521 + }, + { + "url": "https://openreview.net/forum?id=DiQD7FWL233", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7 + ], + "rating": "6.42", + "confidences": [ + 4, + 3, + 5 + ], + "abstract": "Relational regularized autoencoder (RAE) is a framework to learn the distribution of data by minimizing a reconstruction loss together with a relational regularization on the prior of latent space. A recent attempt to reduce the inner discrepancy between the prior and aggregated posterior distributions is to incorporate sliced fused Gromov-Wasserstein (SFG) between these distributions. That approach has a weakness since it treats every slicing direction similarly, meanwhile several directions are not useful for the discriminative task. To improve the discrepancy and consequently the relational regularization, we propose a new relational discrepancy, named spherical sliced fused Gromov Wasserstein (SSFG), that can find an important area of projections characterized by a von Mises-Fisher distribution. Then, we introduce two variants of SSFG to improve its performance. The first variant, named mixture spherical sliced fused Gromov Wasserstein (MSSFG), replaces the vMF distribution by a mixture of von Mises-Fisher distributions to capture multiple important areas of directions that are far from each other. The second variant, named power spherical sliced fused Gromov Wasserstein (PSSFG), replaces the vMF distribution by a power spherical distribution to improve the sampling time of the vMF distribution in high dimension settings. 
We then apply the new discrepancies to the RAE framework to achieve its new variants. Finally, we conduct extensive experiments to show that the new autoencoders have favorable performance in learning latent manifold structure, image generation, and reconstruction.", + "title": "Improving Relational Regularized Autoencoders with Spherical Sliced Fused Gromov Wasserstein", + "authors": [ + "Khai Nguyen", + "Son Nguyen", + "Nhat Ho", + "Tung Pham", + "Hung Bui" + ], + "emails": [ + "vinai.io", + "~Khai_Nguyen1", + "~Nhat_Ho1", + "~Hung_Bui1" + ], + "rank": 522 + }, + { + "url": "https://openreview.net/forum?id=7pgFL2Dkyyy", + "decision": "Accept (Poster)", + "ratings": [ + 3, + 7, + 8, + 7 + ], + "rating": "6.42", + "confidences": [ + 4, + 5, + 5, + 5 + ], + "abstract": "Normalization techniques have proved to be a crucial ingredient of successful training in a traditional supervised learning regime. However, in the zero-shot learning (ZSL) world, these ideas have received only marginal attention. This work studies normalization in ZSL scenario from both theoretical and practical perspectives. First, we give a theoretical explanation to two popular tricks used in zero-shot learning: normalize+scale and attributes normalization and show that they help training by preserving variance during a forward pass. Next, we demonstrate that they are insufficient to normalize a deep ZSL model and propose Class Normalization (CN): a normalization scheme, which alleviates this issue both provably and in practice. Third, we show that ZSL models typically have more irregular loss surface compared to traditional classifiers and that the proposed method partially remedies this problem. Then, we test our approach on 4 standard ZSL datasets and outperform sophisticated modern SotA with a simple MLP optimized without any bells and whistles and having ~50 times faster training speed. 
Finally, we generalize ZSL to a broader problem \u2014 continual ZSL, and introduce some principled metrics and rigorous baselines for this new setup. The source code is available at https://github.com/universome/class-norm.", + "title": "Class Normalization for (Continual)? Generalized Zero-Shot Learning", + "authors": [ + "Ivan Skorokhodov", + "Mohamed Elhoseiny" + ], + "emails": [ + "~Mohamed_Elhoseiny1", + "~Ivan_Skorokhodov1" + ], + "rank": 523 + }, + { + "url": "https://openreview.net/forum?id=VErQxgyrbfn", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 6, + 9, + 6 + ], + "rating": "6.41", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Neural networks have shown tremendous potential for reconstructing high-resolution images in inverse problems. The non-convex and opaque nature of neural networks, however, hinders their utility in sensitive applications such as medical imaging. To cope with this challenge, this paper advocates a convex duality framework that makes a two-layer fully-convolutional ReLU denoising network amenable to convex optimization. The convex dual network not only offers the optimum training with convex solvers, but also facilitates interpreting training and prediction. In particular, it implies training neural networks with weight decay regularization induces path sparsity while the prediction is piecewise linear filtering. A range of experiments with MNIST and fastMRI datasets confirm the efficacy of the dual network optimization problem. ", + "title": "Convex Regularization behind Neural Reconstruction", + "authors": [ + "Arda Sahiner", + "Morteza Mardani", + "Batu Ozturkler", + "Mert Pilanci", + "John M. 
Pauly" + ], + "emails": [ + "~John_M._Pauly1", + "stanford.edu", + "~Morteza_Mardani1", + "~Arda_Sahiner1", + "~Mert_Pilanci3" + ], + "rank": 524 + }, + { + "url": "https://openreview.net/forum?id=mQPBmvyAuk", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6 + ], + "rating": "6.40", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "We develop a methodology for assessing the robustness of models to subpopulation shift---specifically, their ability to generalize to novel data subpopulations that were not observed during training. Our approach leverages the class structure underlying existing datasets to control the data subpopulations that comprise the training and test distributions. This enables us to synthesize realistic distribution shifts whose sources can be precisely controlled and characterized, within existing\nlarge-scale datasets. Applying this methodology to the ImageNet dataset, we create a suite of subpopulation shift benchmarks of varying granularity. We then validate that the corresponding shifts are tractable by obtaining human baselines. Finally, we utilize these benchmarks to measure the sensitivity of standard model architectures as well as the effectiveness of existing train-time robustness interventions. ", + "title": "BREEDS: Benchmarks for Subpopulation Shift", + "authors": [ + "Shibani Santurkar", + "Dimitris Tsipras", + "Aleksander Madry" + ], + "emails": [ + "~Shibani_Santurkar1", + "~Dimitris_Tsipras1", + "~Aleksander_Madry1" + ], + "rank": 525 + }, + { + "url": "https://openreview.net/forum?id=bEoxzW_EXsa", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 8, + 5 + ], + "rating": "6.40", + "confidences": [ + 2, + 4, + 4 + ], + "abstract": "We propose a novel end-to-end non-minimax algorithm for training optimal transport mappings for the quadratic cost (Wasserstein-2 distance). The algorithm uses input convex neural networks and a cycle-consistency regularization to approximate Wasserstein-2 distance. 
In contrast to popular entropic and quadratic regularizers, cycle-consistency does not introduce bias and scales well to high dimensions. From the theoretical side, we estimate the properties of the generative mapping fitted by our algorithm. From the practical side, we evaluate our algorithm on a wide range of tasks: image-to-image color transfer, latent space optimal transport, image-to-image style transfer, and domain adaptation.", + "title": "Wasserstein-2 Generative Networks", + "authors": [ + "Alexander Korotin", + "Vage Egiazarian", + "Arip Asadulaev", + "Alexander Safin", + "Evgeny Burnaev" + ], + "emails": [ + "~Vage_Egiazarian1", + "~Alexander_Korotin2", + "skoltech.ru", + "~Arip_Asadulaev1", + "~Evgeny_Burnaev1" + ], + "rank": 526 + }, + { + "url": "https://openreview.net/forum?id=n7wIfYPdVet", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 6, + 7 + ], + "rating": "6.40", + "confidences": [ + 3, + 3, + 3, + 3, + 3 + ], + "abstract": "Training neural networks with auxiliary tasks is a common practice for improving the performance on a main task of interest.\nTwo main challenges arise in this multi-task learning setting: (i) designing useful auxiliary tasks; and (ii) combining auxiliary tasks into a single coherent loss. Here, we propose a novel framework, AuxiLearn, that targets both challenges based on implicit differentiation. First, when useful auxiliaries are known, we propose learning a network that combines all losses into a single coherent objective function. This network can learn non-linear interactions between tasks. Second, when no useful auxiliary task is known, we describe how to learn a network that generates a meaningful, novel auxiliary task. 
We evaluate AuxiLearn in a series of tasks and domains, including image segmentation and learning with attributes in the low data regime, and find that it consistently outperforms competing methods.", + "title": "Auxiliary Learning by Implicit Differentiation", + "authors": [ + "Aviv Navon", + "Idan Achituve", + "Haggai Maron", + "Gal Chechik", + "Ethan Fetaya" + ], + "emails": [ + "~Gal_Chechik1", + "~Aviv_Navon1", + "~Idan_Achituve1", + "~Ethan_Fetaya1", + "~Haggai_Maron1" + ], + "rank": 527 + }, + { + "url": "https://openreview.net/forum?id=7aL-OtQrBWD", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 7 + ], + "rating": "6.40", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "In this paper, we explore connections between interpretable machine learning and learning theory through the lens of local approximation explanations. First, we tackle the traditional problem of performance generalization and bound the test-time predictive accuracy of a model using a notion of how locally explainable it is. Second, we explore the novel problem of explanation generalization which is an important concern for a growing class of finite sample-based local approximation explanations. Finally, we validate our theoretical results empirically and show that they reflect what can be seen in practice.", + "title": "A Learning Theoretic Perspective on Local Explainability", + "authors": [ + "Jeffrey Li", + "Vaishnavh Nagarajan", + "Gregory Plumb", + "Ameet Talwalkar" + ], + "emails": [ + "~Ameet_Talwalkar1", + "~Jeffrey_Li1", + "~Vaishnavh_Nagarajan3", + "~Gregory_Plumb2" + ], + "rank": 528 + }, + { + "url": "https://openreview.net/forum?id=GMgHyUPrXa", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 6, + 7, + 4 + ], + "rating": "6.40", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "In recent years, great success has been witnessed in building problem-specific deep networks from unrolling iterative algorithms, for solving inverse problems and beyond. 
Unrolling is believed to incorporate the model-based prior with the learning capacity of deep learning. This paper revisits \\textit{the role of unrolling as a design approach for deep networks}: to what extent its resulting special architecture is superior, and can we find better? Using LISTA for sparse recovery as a representative example, we conduct the first thorough \\textit{design space study} for the unrolled models. Among all possible variations, we focus on extensively varying the connectivity patterns and neuron types, leading to a gigantic design space arising from LISTA. To efficiently explore this space and identify top performers, we leverage the emerging tool of neural architecture search (NAS). We carefully examine the searched top architectures in a number of settings, and are able to discover networks that consistently better than LISTA. We further present more visualization and analysis to ``open the black box\", and find that the searched top architectures demonstrate highly consistent and potentially transferable patterns. We hope our study to spark more reflections and explorations on how to better mingle model-based optimization prior and data-driven learning.", + "title": "A Design Space Study for LISTA and Beyond", + "authors": [ + "Tianjian Meng", + "Xiaohan Chen", + "Yifan Jiang", + "Zhangyang Wang" + ], + "emails": [ + "~Zhangyang_Wang1", + "~Yifan_Jiang2", + "~Tianjian_Meng2", + "~Xiaohan_Chen1" + ], + "rank": 529 + }, + { + "url": "https://openreview.net/forum?id=SHvF5xaueVn", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 4, + 7 + ], + "rating": "6.40", + "confidences": [ + 5, + 3, + 3, + 4 + ], + "abstract": "We tackle a challenging blind image denoising problem, in which only single distinct noisy images are available for training a denoiser, and no information about noise is known, except for it being zero-mean, additive, and independent of the clean image. 
In such a setting, which often occurs in practice, it is not possible to train a denoiser with the standard discriminative training or with the recently developed Noise2Noise (N2N) training; the former requires the underlying clean image for the given noisy image, and the latter requires two independently realized noisy image pair for a clean image. To that end, we propose GAN2GAN (Generated-Artificial-Noise to Generated-Artificial-Noise) method that first learns a generative model that can 1) simulate the noise in the given noisy images and 2) generate a rough, noisy estimates of the clean images, then 3) iteratively trains a denoiser with subsequently synthesized noisy image pairs (as in N2N), obtained from the generative model. In results, we show the denoiser trained with our GAN2GAN achieves an impressive denoising performance on both synthetic and real-world datasets for the blind denoising setting; it almost approaches the performance of the standard discriminatively-trained or N2N-trained models that have more information than ours, and it significantly outperforms the recent baseline for the same setting, \\textit{e.g.}, Noise2Void, and a more conventional yet strong one, BM3D. The official code of our method is available at https://github.com/csm9493/GAN2GAN.", + "title": "GAN2GAN: Generative Noise Learning for Blind Denoising with Single Noisy Images", + "authors": [ + "Sungmin Cha", + "Taeeon Park", + "Byeongjoon Kim", + "Jongduk Baek", + "Taesup Moon" + ], + "emails": [ + "naver.com", + "~Taesup_Moon1", + "skku.edu", + "yonsei.ac.kr", + "~Sungmin_Cha1" + ], + "rank": 530 + }, + { + "url": "https://openreview.net/forum?id=8qDwejCuCN", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 8 + ], + "rating": "6.40", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Time series are often complex and rich in information but sparsely labeled and therefore challenging to model. 
In this paper, we propose a self-supervised framework for learning robust and generalizable representations for time series. Our approach, called Temporal Neighborhood Coding (TNC), takes advantage of the local smoothness of a signal's generative process to define neighborhoods in time with stationary properties. Using a debiased contrastive objective, our framework learns time series representations by ensuring that in the encoding space, the distribution of signals from within a neighborhood is distinguishable from the distribution of non-neighboring signals. Our motivation stems from the medical field, where the ability to model the dynamic nature of time series data is especially valuable for identifying, tracking, and predicting the underlying patients' latent states in settings where labeling data is practically impossible. We compare our method to recently developed unsupervised representation learning approaches and demonstrate superior performance on clustering and classification tasks for multiple datasets.", + "title": "Unsupervised Representation Learning for Time Series with Temporal Neighborhood Coding", + "authors": [ + "Sana Tonekaboni", + "Danny Eytan", + "Anna Goldenberg" + ], + "emails": [ + "~Anna_Goldenberg1", + "gmail.com", + "~Sana_Tonekaboni1" + ], + "rank": 531 + }, + { + "url": "https://openreview.net/forum?id=pAbm1qfheGk", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6 + ], + "rating": "6.40", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "We study how to generate molecule conformations (i.e., 3D structures) from a molecular graph. Traditional methods, such as molecular dynamics, sample conformations via computationally expensive simulations. Recently, machine learning methods have shown great potential by training on a large collection of conformation data. 
Challenges arise from the limited model capacity for capturing complex distributions of conformations and the difficulty in modeling long-range dependencies between atoms. Inspired by the recent progress in deep generative models, in this paper, we propose a novel probabilistic framework to generate valid and diverse conformations given a molecular graph. We propose a method combining the advantages of both flow-based and energy-based models, enjoying: (1) a high model capacity to estimate the multimodal conformation distribution; (2) explicitly capturing the complex long-range dependencies between atoms in the observation space. Extensive experiments demonstrate the superior performance of the proposed method on several benchmarks, including conformation generation and distance modeling tasks, with a significant improvement over existing generative models for molecular conformation sampling.", + "title": "Learning Neural Generative Dynamics for Molecular Conformation Generation", + "authors": [ + "Minkai Xu", + "Shitong Luo", + "Yoshua Bengio", + "Jian Peng", + "Jian Tang" + ], + "emails": [ + "~Jian_Peng1", + "~Yoshua_Bengio1", + "~Shitong_Luo1", + "~Jian_Tang1", + "~Minkai_Xu1" + ], + "rank": 532 + }, + { + "url": "https://openreview.net/forum?id=-N7PBXqOUJZ", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 5, + 6, + 7 + ], + "rating": "6.40", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Viewing recurrent neural networks (RNNs) as continuous-time dynamical systems, we propose a recurrent unit that describes the hidden state's evolution with two parts: a well-understood linear component plus a Lipschitz nonlinearity. This particular functional form facilitates stability analysis of the long-term behavior of the recurrent unit using tools from nonlinear systems theory. In turn, this enables architectural design decisions before experimentation. 
Sufficient conditions for global stability of the recurrent unit are obtained, motivating a novel scheme for constructing hidden-to-hidden matrices. Our experiments demonstrate that the Lipschitz RNN can outperform existing recurrent units on a range of benchmark tasks, including computer vision, language modeling and speech prediction tasks. Finally, through Hessian-based analysis we demonstrate that our Lipschitz recurrent unit is more robust with respect to input and parameter perturbations as compared to other continuous-time RNNs.", + "title": "Lipschitz Recurrent Neural Networks", + "authors": [ + "N. Benjamin Erichson", + "Omri Azencot", + "Alejandro Queiruga", + "Liam Hodgkinson", + "Michael W. Mahoney" + ], + "emails": [ + "~N._Benjamin_Erichson1", + "bgu.ac.il", + "~Michael_W._Mahoney1", + "google.com", + "~Liam_Hodgkinson1" + ], + "rank": 533 + }, + { + "url": "https://openreview.net/forum?id=7uVcpu-gMD", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 8 + ], + "rating": "6.40", + "confidences": [ + 4, + 3, + 5, + 3 + ], + "abstract": "Neural networks (NNs) whose subnetworks implement reusable functions are expected to offer numerous advantages, including compositionality through efficient recombination of functional building blocks, interpretability, preventing catastrophic interference, etc. Understanding if and how NNs are modular could provide insights into how to improve them. Current inspection methods, however, fail to link modules to their functionality. In this paper, we present a novel method based on learning binary weight masks to identify individual weights and subnets responsible for specific functions. Using this powerful tool, we contribute an extensive study of emerging modularity in NNs that covers several standard architectures and datasets. 
We demonstrate how common NNs fail to reuse submodules and offer new insights into the related issue of systematic generalization on language tasks.", + "title": "Are Neural Nets Modular? Inspecting Functional Modularity Through Differentiable Weight Masks", + "authors": [ + "R\u00f3bert Csord\u00e1s", + "Sjoerd van Steenkiste", + "J\u00fcrgen Schmidhuber" + ], + "emails": [ + "~J\u00fcrgen_Schmidhuber1", + "~Sjoerd_van_Steenkiste1", + "~R\u00f3bert_Csord\u00e1s1" + ], + "rank": 534 + }, + { + "url": "https://openreview.net/forum?id=rsogjAnYs4z", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 7 + ], + "rating": "6.40", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "We study two factors in neural network training: data parallelism and sparsity; here, data parallelism means processing training data in parallel using distributed systems (or equivalently increasing batch size), so that training can be accelerated; for sparsity, we refer to pruning parameters in a neural network model, so as to reduce computational and memory cost. Despite their promising benefits, however, understanding of their effects on neural network training remains elusive. In this work, we first measure these effects rigorously by conducting extensive experiments while tuning all metaparameters involved in the optimization. As a result, we find across various workloads of data set, network model, and optimization algorithm that there exists a general scaling trend between batch size and number of training steps to convergence for the effect of data parallelism, and further, difficulty of training under sparsity. 
Then, we develop a theoretical analysis based on the convergence properties of stochastic gradient methods and smoothness of the optimization landscape, which illustrates the observed phenomena precisely and generally, establishing a better account of the effects of data parallelism and sparsity on neural network training.", + "title": "Understanding the effects of data parallelism and sparsity on neural network training", + "authors": [ + "Namhoon Lee", + "Thalaiyasingam Ajanthan", + "Philip Torr", + "Martin Jaggi" + ], + "emails": [ + "~Namhoon_Lee1", + "~Thalaiyasingam_Ajanthan1", + "~Philip_Torr1", + "~Martin_Jaggi1" + ], + "rank": 535 + }, + { + "url": "https://openreview.net/forum?id=eEn8KTtJOx", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7 + ], + "rating": "6.40", + "confidences": [ + 2, + 4, + 4 + ], + "abstract": "With the thriving of deep learning and the widespread practice of using pre-trained networks, backdoor attacks have become an increasing security threat drawing many research interests in recent years. A third-party model can be poisoned in training to work well in normal conditions but behave maliciously when a trigger pattern appears. However, the existing backdoor attacks are all built on noise perturbation triggers, making them noticeable to humans. In this paper, we instead propose using warping-based triggers. The proposed backdoor outperforms the previous methods in a human inspection test by a wide margin, proving its stealthiness. To make such models undetectable by machine defenders, we propose a novel training mode, called the ``noise mode''. The trained networks successfully attack and bypass the state-of-the-art defense methods on standard classification datasets, including MNIST, CIFAR-10, GTSRB, and CelebA. 
Behavior analyses show that our backdoors are transparent to network inspection, further proving this novel attack mechanism's efficiency.", + "title": "WaNet - Imperceptible Warping-based Backdoor Attack", + "authors": [ + "Tuan Anh Nguyen", + "Anh Tuan Tran" + ], + "emails": [ + "~Anh_Tuan_Tran2", + "~Tuan_Anh_Nguyen4" + ], + "rank": 536 + }, + { + "url": "https://openreview.net/forum?id=sTeoJiB4uR", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 4, + 6, + 8 + ], + "rating": "6.39", + "confidences": [ + 5, + 4, + 4, + 5 + ], + "abstract": "Deep generative models provide a powerful set of tools to understand real-world data. But as these models improve, they increase in size and complexity, so their computational cost in memory and execution time grows. Using binary weights in neural networks is one method which has shown promise in reducing this cost. However, whether binary neural networks can be used in generative models is an open problem. In this work we show, for the first time, that we can successfully train generative models which utilize binary neural networks. This reduces the computational cost of the models massively. We develop a new class of binary weight normalization, and provide insights for architecture designs of these binarized generative models. We demonstrate that two state-of-the-art deep generative models, the ResNet VAE and Flow++ models, can be binarized effectively using these techniques. 
We train binary models that achieve loss values close to those of the regular models but are 90%-94% smaller in size, and also allow significant speed-ups in execution time.", + "title": "Reducing the Computational Cost of Deep Generative Models with Binary Neural Networks", + "authors": [ + "Thomas Bird", + "Friso Kingma", + "David Barber" + ], + "emails": [ + "~Thomas_Bird1", + "gmail.com", + "~David_Barber1" + ], + "rank": 537 + }, + { + "url": "https://openreview.net/forum?id=Xa3iM4C1nqd", + "decision": "Reject", + "ratings": [ + 7, + 5, + 7 + ], + "rating": "6.38", + "confidences": [ + 4, + 4, + 5 + ], + "abstract": "Robustness is an important, and yet, under-explored aspect of unsupervised representation learning, which has seen a lot of recent developments. In this work, we address this gap by developing a novel framework: Unsupervised Robust Representation Learning (URRL), which combines unsupervised representation learning's pretext task and robust supervised learning (e.g., AugMix). Moreover, it is commonly assumed that there needs to be a trade-off between natural accuracy (on clean data) and robust accuracy (on corrupted data). We upend this view and show that URRL improves both the natural accuracy of unsupervised representation learning and its robustness to corruptions and adversarial noise. A further challenge is that the robustness of a representation might not be preserved in the transfer learning process after fine-tuning on downstream tasks. We develop transferable robustness by proposing a task-agnostic similarity regularization during the fine-tuning process. 
We show that this improves the robustness of the resulting model without the need for any adversarial training or further data augmentation during fine-tuning.", + "title": "Transferable Unsupervised Robust Representation Learning", + "authors": [ + "De-An Huang", + "Zhiding Yu", + "Anima Anandkumar" + ], + "emails": [ + "~Anima_Anandkumar1", + "~De-An_Huang1", + "~Zhiding_Yu1" + ], + "rank": 538 + }, + { + "url": "https://openreview.net/forum?id=Uu1Nw-eeTxJ", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 7 + ], + "rating": "6.38", + "confidences": [ + 5, + 4, + 4 + ], + "abstract": "Recent studies have demonstrated the overwhelming advantage of cross-lingual pre-trained models (PTMs), such as multilingual BERT and XLM, on cross-lingual NLP tasks. However, existing approaches essentially capture the co-occurrence among tokens through involving the masked language model (MLM) objective with token-level cross entropy. In this work, we extend these approaches to learn sentence-level representations and show the effectiveness on cross-lingual understanding and generation. Specifically, we propose a Hierarchical Contrastive Learning (HiCTL) method to (1) learn universal representations for parallel sentences distributed in one or multiple languages and (2) distinguish the semantically-related words from a shared cross-lingual vocabulary for each sentence. We conduct evaluations on two challenging cross-lingual tasks, XTREME and machine translation. 
Experimental results show that the HiCTL outperforms the state-of-the-art XLM-R by an absolute gain of 4.2% accuracy on the XTREME benchmark as well as achieves substantial improvements on both of the high resource and low-resource English\u2192X translation tasks over strong baselines.", + "title": "On Learning Universal Representations Across Languages", + "authors": [ + "Xiangpeng Wei", + "Rongxiang Weng", + "Yue Hu", + "Luxi Xing", + "Heng Yu", + "Weihua Luo" + ], + "emails": [ + "iie.ac.cn", + "~Xiangpeng_Wei1", + "alibaba-inc.com" + ], + "rank": 539 + }, + { + "url": "https://openreview.net/forum?id=-Hs_otp2RB", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 7 + ], + "rating": "6.38", + "confidences": [ + 3, + 3, + 5, + 2 + ], + "abstract": "Variational autoencoders (VAEs) have recently been shown to be vulnerable to adversarial attacks, wherein they are fooled into reconstructing a chosen target image. However, how to defend against such attacks remains an open problem. We make significant advances in addressing this issue by introducing methods for producing adversarially robust VAEs. Namely, we first demonstrate that methods proposed to obtain disentangled latent representations produce VAEs that are more robust to these attacks. However, this robustness comes at the cost of reducing the quality of the reconstructions. We ameliorate this by applying disentangling methods to hierarchical VAEs. The resulting models produce high-fidelity autoencoders that are also adversarially robust. 
We confirm their capabilities on several different datasets and with current state-of-the-art VAE adversarial attacks, and also show that they increase the robustness of downstream tasks to attack.", + "title": "Improving VAEs' Robustness to Adversarial Attack", + "authors": [ + "Matthew JF Willetts", + "Alexander Camuto", + "Tom Rainforth", + "S Roberts", + "Christopher C Holmes" + ], + "emails": [ + "~Matthew_JF_Willetts1", + "turing.ac.uk", + "~Tom_Rainforth1", + "stats.ox.ac.uk", + "~S_Roberts1" + ], + "rank": 540 + }, + { + "url": "https://openreview.net/forum?id=TVbDOOr6hL", + "decision": "Reject", + "ratings": [ + 7, + 6, + 7, + 6 + ], + "rating": "6.38", + "confidences": [ + 2, + 4, + 3, + 4 + ], + "abstract": "This paper provides a generative approach for causal inference using data from observational studies. Inspired by the work of Kingma et al. (2014), we propose a sequence of three architectures (namely Series, Parallel, and Hybrid) that each incorporate their M1 and M2 models as building blocks. Each architecture is an improvement over the previous one in terms of estimating causal effect, culminating in the Hybrid model. The Hybrid model is designed to encourage decomposing the underlying factors of any observational dataset; this in turn, helps to accurately estimate all treatment outcomes. Our empirical results demonstrate the superiority of all three proposed architectures compared to both state-of-the-art discriminative as well as other generative approaches in the literature. 
", + "title": "Variational Auto-Encoder Architectures that Excel at Causal Inference", + "authors": [ + "Negar Hassanpour", + "Russell Greiner" + ], + "emails": [ + "~Negar_Hassanpour1", + "~Russell_Greiner2" + ], + "rank": 541 + }, + { + "url": "https://openreview.net/forum?id=gHsr-v8Tz6l", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 8 + ], + "rating": "6.38", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Domain generalization addresses the out-of-distribution problem, which is challenging due to the domain shift and the uncertainty caused by the inaccessibility to data from the target domains. In this paper, we propose variational invariant learning, a probabilistic inference framework that jointly models domain invariance and uncertainty. We introduce variational Bayesian approximation into both the feature representation and classifier layers to facilitate invariant learning for better generalization across domains. In the probabilistic modeling framework, we introduce a domain-invariant principle to explore invariance across domains in a unified way. We incorporate the principle into the variational Bayesian layers in neural networks, achieving domain-invariant representations and classifier. We empirically demonstrate the effectiveness of our proposal on four widely used cross-domain visual recognition benchmarks. Ablation studies demonstrate the benefits of our proposal and on all benchmarks our variational invariant learning consistently delivers state-of-the-art performance. ", + "title": "Variational Invariant Learning for Bayesian Domain Generalization", + "authors": [ + "Zehao Xiao", + "Jiayi Shen", + "Xiantong Zhen", + "Ling Shao", + "Cees G. M. 
Snoek" + ], + "emails": [ + "~Jiayi_Shen3", + "~Xiantong_Zhen1", + "~Cees_G._M._Snoek1", + "~Zehao_Xiao1", + "~Ling_Shao1" + ], + "rank": 542 + }, + { + "url": "https://openreview.net/forum?id=98fWAc-sFkv", + "decision": "Reject", + "ratings": [ + 8, + 4, + 6, + 7 + ], + "rating": "6.38", + "confidences": [ + 5, + 4, + 3, + 4 + ], + "abstract": "Continual Learning is a learning paradigm where learning systems are trained on a sequence of tasks. The goal here is to perform well on the current task without suffering from a performance drop on the previous tasks. Two notable directions among the recent advances in continual learning with neural networks are (1) variational Bayes based regularization by learning priors from previous tasks, and, (2) learning the structure of deep networks to adapt to new tasks. So far, these two approaches have been orthogonal. We present a novel Bayesian framework for continual learning based on learning the structure of deep neural networks, addressing the shortcomings of both these approaches. The proposed framework learns the deep structure for each task by learning which weights to be used, and supports inter-task transfer through the overlapping of different sparse subsets of weights learned by different tasks. An appealing aspect of our proposed continual learning framework is that it is applicable to both discriminative (supervised) and generative (unsupervised) settings. 
Experimental results on supervised and unsupervised benchmarks shows that our model performs comparably or better than recent advances in continual learning.", + "title": "A Unified Bayesian Framework for Discriminative and Generative Continual Learning", + "authors": [ + "Abhishek Kumar", + "Sunabha Chatterjee", + "Piyush Rai" + ], + "emails": [ + "gmail.com", + "~Piyush_Rai1" + ], + "rank": 543 + }, + { + "url": "https://openreview.net/forum?id=uKhGRvM8QNH", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6 + ], + "rating": "6.38", + "confidences": [ + 5, + 4, + 4 + ], + "abstract": "Knowledge distillation, in which a student model is trained to mimic a teacher model, has been proved as an effective technique for model compression and model accuracy boosting. However, most knowledge distillation methods, designed for image classification, have failed on more challenging tasks, such as object detection. In this paper, we suggest that the failure of knowledge distillation on object detection is mainly caused by two reasons: (1) the imbalance between pixels of foreground and background and (2) lack of distillation on the relation between different pixels. Observing the above reasons, we propose attention-guided distillation and non-local distillation to address the two problems, respectively. Attention-guided distillation is proposed to find the crucial pixels of foreground objects with attention mechanism and then make the students take more effort to learn their features. Non-local distillation is proposed to enable students to learn not only the feature of an individual pixel but also the relation between different pixels captured by non-local modules. Experiments show that our methods achieve excellent AP improvements on both one-stage and two-stage, both anchor-based and anchor-free detectors. For example, Faster RCNN (ResNet101 backbone) with our distillation achieves 43.9 AP on COCO2017, which is 4.1 higher than the baseline. 
Codes have been released on Github.", + "title": "Improve Object Detection with Feature-based Knowledge Distillation: Towards Accurate and Efficient Detectors", + "authors": [ + "Linfeng Zhang", + "Kaisheng Ma" + ], + "emails": [ + "~Kaisheng_Ma1", + "~Linfeng_Zhang2" + ], + "rank": 544 + }, + { + "url": "https://openreview.net/forum?id=BUlyHkzjgmA", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 6 + ], + "rating": "6.38", + "confidences": [ + 2, + 3, + 4, + 4 + ], + "abstract": "Concentration of measure has been argued to be the fundamental cause of adversarial vulnerability. Mahloujifar et al. (2019) presented an empirical way to measure the concentration of a data distribution using samples, and employed it to find lower bounds on intrinsic robustness for several benchmark datasets. However, it remains unclear whether these lower bounds are tight enough to provide a useful approximation for the intrinsic robustness of a dataset. To gain a deeper understanding of the concentration of measure phenomenon, we first extend the Gaussian Isoperimetric Inequality to non-spherical Gaussian measures and arbitrary \u2113p-norms (p\u22652). We leverage these theoretical insights to design a method that uses half-spaces to estimate the concentration of any empirical dataset under \u2113p-norm distance metrics. Our proposed algorithm is more efficient than Mahloujifar et al. (2019)'s, and experiments on synthetic datasets and image benchmarks demonstrate that it is able to find much tighter intrinsic robustness bounds. 
These tighter estimates provide further evidence that rules out intrinsic dataset concentration as a possible explanation for the adversarial vulnerability of state-of-the-art classifiers.", + "title": "Improved Estimation of Concentration Under \u2113p-Norm Distance Metrics Using Half Spaces", + "authors": [ + "Jack Prescott", + "Xiao Zhang", + "David Evans" + ], + "emails": [ + "virginia.edu", + "~David_Evans1", + "~Xiao_Zhang2" + ], + "rank": 545 + }, + { + "url": "https://openreview.net/forum?id=iaO86DUuKi", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 6 + ], + "rating": "6.38", + "confidences": [ + 3, + 3, + 2, + 5 + ], + "abstract": "Safe exploration presents a major challenge in reinforcement learning (RL): when active data collection requires deploying partially trained policies, we must ensure that these policies avoid catastrophically unsafe regions, while still enabling trial and error learning. In this paper, we target the problem of safe exploration in RL, by learning a conservative safety estimate of environment states through a critic, and provably upper bound the likelihood of catastrophic failures at every training iteration. We theoretically characterize the tradeoff between safety and policy improvement, show that the safety constraints are satisfied with high probability during training, derive provable convergence guarantees for our approach which is no worse asymptotically then standard RL, and empirically demonstrate the efficacy of the proposed approach on a suite of challenging navigation, manipulation, and locomotion tasks. Our results demonstrate that the proposed approach can achieve competitive task performance, while incurring significantly lower catastrophic failure rates during training as compared to prior methods. 
Videos are at this URL https://sites.google.com/view/conservative-safety-critics/", + "title": "Conservative Safety Critics for Exploration", + "authors": [ + "Homanga Bharadhwaj", + "Aviral Kumar", + "Nicholas Rhinehart", + "Sergey Levine", + "Florian Shkurti", + "Animesh Garg" + ], + "emails": [ + "~Aviral_Kumar2", + "~Nicholas_Rhinehart1", + "~Animesh_Garg1", + "~Florian_Shkurti1", + "~Sergey_Levine1", + "~Homanga_Bharadhwaj1" + ], + "rank": 546 + }, + { + "url": "https://openreview.net/forum?id=3q5IqUrkcF", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7 + ], + "rating": "6.38", + "confidences": [ + 4, + 4, + 5 + ], + "abstract": "Gradient descent can be surprisingly good at optimizing deep neural networks without overfitting and without explicit regularization. We find that the discrete steps of gradient descent implicitly regularize models by penalizing gradient descent trajectories that have large loss gradients. We call this Implicit Gradient Regularization (IGR) and we use backward error analysis to calculate the size of this regularization. We confirm empirically that implicit gradient regularization biases gradient descent toward flat minima, where test errors are small and solutions are robust to noisy parameter perturbations. Furthermore, we demonstrate that the implicit gradient regularization term can be used as an explicit regularizer, allowing us to control this gradient regularization directly. 
More broadly, our work indicates that backward error analysis is a useful theoretical approach to the perennial question of how learning rate, model size, and parameter regularization interact to determine the properties of overparameterized models optimized with gradient descent.", + "title": "Implicit Gradient Regularization", + "authors": [ + "David Barrett", + "Benoit Dherin" + ], + "emails": [ + "~David_Barrett1", + "google.com" + ], + "rank": 547 + }, + { + "url": "https://openreview.net/forum?id=gJYlaqL8i8", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6 + ], + "rating": "6.36", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "Experience replay, which enables the agents to remember and reuse experience from the past, has played a significant role in the success of off-policy reinforcement learning (RL). To utilize the experience replay efficiently, the existing sampling methods allow selecting out more meaningful experiences by imposing priorities on them based on certain metrics (e.g. TD-error). However, they may result in sampling highly biased, redundant transitions since they compute the sampling rate for each transition independently, without consideration of its importance in relation to other transitions. In this paper, we aim to address the issue by proposing a new learning-based sampling method that can compute the relative importance of transition. To this end, we design a novel permutation-equivariant neural architecture that takes contexts from not only features of each transition (local) but also those of others (global) as inputs. We validate our framework, which we refer to as Neural Experience Replay Sampler (NERS), on multiple benchmark tasks for both continuous and discrete control tasks and show that it can significantly improve the performance of various off-policy RL methods. 
Further analysis confirms that the improvements of the sample efficiency indeed are due to sampling diverse and meaningful transitions by NERS that considers both local and global contexts. ", + "title": "Learning to Sample with Local and Global Contexts in Experience Replay Buffer", + "authors": [ + "Youngmin Oh", + "Kimin Lee", + "Jinwoo Shin", + "Eunho Yang", + "Sung Ju Hwang" + ], + "emails": [ + "~Kimin_Lee1", + "~Youngmin_Oh2", + "~Eunho_Yang1", + "~Jinwoo_Shin1", + "~Sung_Ju_Hwang1" + ], + "rank": 548 + }, + { + "url": "https://openreview.net/forum?id=vXj_ucZQ4hA", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7 + ], + "rating": "6.36", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "Overparameterized Neural Networks (NN) display state-of-the-art performance. However, there is a growing need for smaller, energy-efficient, neural networks to be able to use machine learning applications on devices with limited computational resources. A popular approach consists of using pruning techniques. While these techniques have traditionally focused on pruning pre-trained NN (LeCun et al.,1990; Hassibi et al., 1993), recent work by Lee et al. (2018) has shown promising results when pruning at initialization. However, for Deep NNs, such procedures remain unsatisfactory as the resulting pruned networks can be difficult to train and, for instance, they do not prevent one layer from being fully pruned. In this paper, we provide a comprehensive theoretical analysis of Magnitude and Gradient based pruning at initialization and training of sparse architectures. 
This allows us to propose novel principled approaches which we validate experimentally on a variety of NN architectures.", + "title": "Robust Pruning at Initialization", + "authors": [ + "Soufiane Hayou", + "Jean-Francois Ton", + "Arnaud Doucet", + "Yee Whye Teh" + ], + "emails": [ + "~Soufiane_Hayou1", + "~Arnaud_Doucet2", + "~Jean-Francois_Ton1", + "~Yee_Whye_Teh1" + ], + "rank": 549 + }, + { + "url": "https://openreview.net/forum?id=8nl0k08uMi", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6 + ], + "rating": "6.36", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "The properties of individual neurons are often analyzed in order to understand the biological and artificial neural networks in which they're embedded. Class selectivity\u2014typically defined as how different a neuron's responses are across different classes of stimuli or data samples\u2014is commonly used for this purpose. However, it remains an open question whether it is necessary and/or sufficient for deep neural networks (DNNs) to learn class selectivity in individual units. We investigated the causal impact of class selectivity on network function by directly regularizing for or against class selectivity. Using this regularizer to reduce class selectivity across units in convolutional neural networks increased test accuracy by over 2% in ResNet18 and 1% in ResNet50 trained on Tiny ImageNet. For ResNet20 trained on CIFAR10 we could reduce class selectivity by a factor of 2.5 with no impact on test accuracy, and reduce it nearly to zero with only a small (~2%) drop in test accuracy. In contrast, regularizing to increase class selectivity significantly decreased test accuracy across all models and datasets. These results indicate that class selectivity in individual units is neither sufficient nor strictly necessary, and can even impair DNN performance. 
They also encourage caution when focusing on the properties of single units as representative of the mechanisms by which DNNs function.", + "title": "Selectivity considered harmful: evaluating the causal impact of class selectivity in DNNs", + "authors": [ + "Matthew L Leavitt", + "Ari S. Morcos" + ], + "emails": [ + "~Matthew_L_Leavitt1", + "~Ari_S._Morcos1" + ], + "rank": 550 + }, + { + "url": "https://openreview.net/forum?id=L8BElg6Qldb", + "decision": "Reject", + "ratings": [ + 6, + 6, + 7 + ], + "rating": "6.36", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "We present a framework to derive bounds on the test loss of randomized learning algorithms for the case of bounded loss functions. This framework leads to bounds that depend on the conditional information density between the the output hypothesis and the choice of the training set, given a larger set of data samples from which the training set is formed. Furthermore, the bounds pertain to the average test loss as well as to its tail probability, both for the PAC-Bayesian and the single-draw settings. If the conditional information density is bounded uniformly in the size n of the training set, our bounds decay as 1/n, which is referred to as a fast rate. This is in contrast with the tail bounds involving conditional information measures available in the literature, which have a less benign 1/n dependence. 
We demonstrate the usefulness of our tail bounds by showing that they lead to estimates of the test loss achievable with several neural network architectures trained on MNIST and Fashion-MNIST that match the state-of-the-art bounds available in the literature.", + "title": "Nonvacuous Loss Bounds with Fast Rates for Neural Networks via Conditional Information Measures", + "authors": [ + "Fredrik Hellstr\u00f6m", + "Giuseppe Durisi" + ], + "emails": [ + "~Fredrik_Hellstr\u00f6m1", + "~Giuseppe_Durisi1" + ], + "rank": 551 + }, + { + "url": "https://openreview.net/forum?id=MBpHUFrcG2x", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6 + ], + "rating": "6.36", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "We introduce Projected Latent Markov Chain Monte Carlo (PL-MCMC), a technique for sampling from the exact conditional distributions learned by normalizing flows. As a conditional sampling method, PL-MCMC enables Monte Carlo Expectation Maximization (MC-EM) training of normalizing flows from incomplete data. Through experimental tests applying normalizing flows to missing data tasks for a variety of data sets, we demonstrate the efficacy of PL-MCMC for conditional sampling from normalizing flows.", + "title": "Projected Latent Markov Chain Monte Carlo: Conditional Sampling of Normalizing Flows", + "authors": [ + "Chris Cannella", + "Mohammadreza Soltani", + "Vahid Tarokh" + ], + "emails": [ + "~Chris_Cannella1", + "~Mohammadreza_Soltani1", + "~Vahid_Tarokh1" + ], + "rank": 552 + }, + { + "url": "https://openreview.net/forum?id=EXkD6ZjvJQQ", + "decision": "Reject", + "ratings": [ + 6, + 6, + 7 + ], + "rating": "6.36", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "This paper investigates the finite-sample prediction risk of the high-dimensional least squares estimator. We derive the central limit theorem for the prediction risk when both the sample size and the number of features tend to infinity. 
Furthermore, the finite-sample distribution and the confidence interval of the prediction risk are provided. Our theoretical results demonstrate the sample-wise non-monotonicity of the prediction risk and confirm ''more data hurt'' phenomenon.", + "title": "Provable More Data Hurt in High Dimensional Least Squares Estimator", + "authors": [ + "Zeng Li", + "Chuanlong Xie", + "QINWEN WANG" + ], + "emails": [ + "sustech.edu.cn", + "~QINWEN_WANG1", + "~Chuanlong_Xie1" + ], + "rank": 553 + }, + { + "url": "https://openreview.net/forum?id=TGFO0DbD_pk", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6 + ], + "rating": "6.36", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "The combination of Evolutionary Algorithms (EAs) and Deep Reinforcement Learning (DRL) has been recently proposed to merge the benefits of both solutions. Existing mixed approaches, however, have been successfully applied only to actor-critic methods and present significant overhead. We address these issues by introducing a novel mixed framework that exploits a periodical genetic evaluation to soft update the weights of a DRL agent. The resulting approach is applicable with any DRL method and, in a worst-case scenario, it does not exhibit detrimental behaviours. Experiments in robotic applications and continuous control benchmarks demonstrate the versatility of our approach that significantly outperforms prior DRL, EAs, and mixed approaches. 
Finally, we employ formal verification to confirm the policy improvement, mitigating the inefficient exploration and hyper-parameter sensitivity of DRL.ment, mitigating the inefficient exploration and hyper-parameter sensitivity of DRL.", + "title": "Genetic Soft Updates for Policy Evolution in Deep Reinforcement Learning", + "authors": [ + "Enrico Marchesini", + "Davide Corsi", + "Alessandro Farinelli" + ], + "emails": [ + "~Enrico_Marchesini1", + "univr.it", + "~Alessandro_Farinelli1" + ], + "rank": 554 + }, + { + "url": "https://openreview.net/forum?id=zfO1MwBFu-", + "decision": "Reject", + "ratings": [ + 6, + 7, + 6 + ], + "rating": "6.36", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "Sequential variational autoencoders (VAEs) with global latent variable z have been studied for the purpose of disentangling the global features of data, which is useful in many downstream tasks. To assist the sequential VAEs further in obtaining meaningful z, an auxiliary loss that maximizes the mutual information (MI) between the observation and z is often employed. However, by analyzing the sequential VAEs from the information theoretic perspective, we can claim that simply maximizing the MI encourages the latent variables to have redundant information and prevents the disentanglement of global and local features. Based on this analysis, we derive a novel regularization method that makes z informative while encouraging the disentanglement. Specifically, the proposed method removes redundant information by minimizing the MI between z and the local features by using adversarial training. In the experiments, we trained state-space and autoregressive model variants using speech and image datasets. 
The results indicate that the proposed method improves the performance of the downstream classification and data generation tasks, thereby supporting our information theoretic perspective in the learning of global representations.", + "title": "Information Theoretic Regularization for Learning Global Features by Sequential VAE", + "authors": [ + "Kei Akuzawa", + "Yusuke Iwasawa", + "Yutaka Matsuo" + ], + "emails": [ + "~Yutaka_Matsuo1", + "~Yusuke_Iwasawa1", + "~Kei_Akuzawa1" + ], + "rank": 555 + }, + { + "url": "https://openreview.net/forum?id=PH5PH9ZO_4", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6 + ], + "rating": "6.36", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "Machine learning (ML) models that learn and predict properties of computer programs are increasingly being adopted and deployed. \nThese models have demonstrated success in applications such as auto-completing code, summarizing large programs, and detecting bugs and malware in programs. \nIn this work, we investigate principled ways to adversarially perturb a computer program to fool such learned models, and thus determine their adversarial robustness. We use program obfuscations, which have conventionally been used to avoid attempts at reverse engineering programs, as adversarial perturbations. These perturbations modify programs in ways that do not alter their functionality but can be crafted to deceive an ML model when making a decision. We provide a general formulation for an adversarial program that allows applying multiple obfuscation transformations to a program in any language. We develop first-order optimization algorithms to efficiently determine two key aspects -- which parts of the program to transform, and what transformations to use. We show that it is important to optimize both these aspects to generate the best adversarially perturbed program. 
Due to the discrete nature of this problem, we also propose using randomized smoothing to improve the attack loss landscape to ease optimization. \nWe evaluate our work on Python and Java programs on the problem of program summarization. \nWe show that our best attack proposal achieves a 52% improvement over a state-of-the-art attack generation approach for programs trained on a \\textsc{seq2seq} model.\nWe further show that our formulation is better at training models that are robust to adversarial attacks.", + "title": "Generating Adversarial Computer Programs using Optimized Obfuscations", + "authors": [ + "Shashank Srikant", + "Sijia Liu", + "Tamara Mitrovska", + "Shiyu Chang", + "Quanfu Fan", + "Gaoyuan Zhang", + "Una-May O'Reilly" + ], + "emails": [ + "~Gaoyuan_Zhang1", + "~Una-May_O'Reilly1", + "~Shiyu_Chang2", + "mit.edu", + "~Shashank_Srikant1", + "~Quanfu_Fan1", + "~Sijia_Liu1" + ], + "rank": 556 + }, + { + "url": "https://openreview.net/forum?id=vsU0efpivw", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6 + ], + "rating": "6.36", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "Shapley values have become one of the most popular feature attribution explanation methods. However, most prior work has focused on post-hoc Shapley explanations, which can be computationally demanding due to its exponential time complexity and preclude model regularization based on Shapley explanations during training. Thus, we propose to incorporate Shapley values themselves as latent representations in deep models thereby making Shapley explanations first-class citizens in the modeling paradigm. This intrinsic explanation approach enables layer-wise explanations, explanation regularization of the model during training, and fast explanation computation at test time. We define the Shapley transform that transforms the input into a Shapley representation given a specific function. 
We operationalize the Shapley transform as a neural network module and construct both shallow and deep networks, called ShapNets, by composing Shapley modules. We prove that our Shallow ShapNets compute the exact Shapley values and our Deep ShapNets maintain the missingness and accuracy properties of Shapley values. We demonstrate on synthetic and real-world datasets that our ShapNets enable layer-wise Shapley explanations, novel Shapley regularizations during training, and fast computation while maintaining reasonable performance. Code is available at https://github.com/inouye-lab/ShapleyExplanationNetworks.", + "title": "Shapley Explanation Networks", + "authors": [ + "Rui Wang", + "Xiaoqian Wang", + "David I. Inouye" + ], + "emails": [ + "~Rui_Wang1", + "~David_I._Inouye1", + "~Xiaoqian_Wang1" + ], + "rank": 557 + }, + { + "url": "https://openreview.net/forum?id=3T9iFICe0Y9", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6 + ], + "rating": "6.36", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "The study of deep neural networks (DNNs) in the infinite-width limit, via the so-called neural tangent kernel (NTK) approach, has provided new insights into the dynamics of learning, generalization, and the impact of initialization. One key DNN architecture remains to be kernelized, namely, the recurrent neural network (RNN). In this paper we introduce and study the Recurrent Neural Tangent Kernel (RNTK), which provides new insights into the behavior of overparametrized RNNs. A key property of the RNTK should greatly benefit practitioners is its ability to compare inputs of different length. To this end, we characterize how the RNTK weights different time steps to form its output under different initialization parameters and nonlinearity choices. A synthetic and 56 real-world data experiments demonstrate that the RNTK offers significant performance gains over other kernels, including standard NTKs, across a wide array of data sets. 
", + "title": "The Recurrent Neural Tangent Kernel", + "authors": [ + "Sina Alemohammad", + "Zichao Wang", + "Randall Balestriero", + "Richard Baraniuk" + ], + "emails": [ + "~Sina_Alemohammad1", + "~Zichao_Wang1", + "~Richard_Baraniuk1", + "~Randall_Balestriero1" + ], + "rank": 558 + }, + { + "url": "https://openreview.net/forum?id=_FXqMj7T0QQ", + "decision": "Reject", + "ratings": [ + 7, + 7, + 6, + 6 + ], + "rating": "6.36", + "confidences": [ + 1, + 3, + 4, + 3 + ], + "abstract": "Despite its groundbreaking success in Go and computer games, Monte Carlo Tree Search (MCTS) is computationally expensive as it requires a substantial number of rollouts to construct the search tree, which calls for effective parallelization. However, how to design effective parallel MCTS algorithms has not been systematically studied and remains poorly understood. In this paper, we seek to lay its first theoretical foundation, by examining the potential performance loss caused by parallelization when achieving a desired speedup. In particular, we discover two necessary conditions for achieving a desirable parallelization performance and highlight two of their practical benefits. First, by examining whether existing parallel MCTS algorithms satisfy these conditions, we identify key design principles that should be inherited by future algorithms. Specifically, we find that, although no existing algorithm satisfies both conditions in general (i.e., they are still far from optimal), pre-adjusting the nodes visit counts (used in VL-UCT (Segal, 2010) and WU-UCT (Liu et al., 2020)) is essential. In particular, WU-UCT pre-adjusts visit counts in a way that always satisfies one necessary condition while its other designs meet the other condition when the maximum tree depth is 2. We theoretically establish that they together facilitate O(ln\u2061n+M/ln\u2061n) cumulative regret under the depth-2 setting, where n is the number of rollouts and M is the number of workers. 
A regret of this form is highly desirable, as compared to O(ln\u2061n) regret incurred by a sequential counterpart, its excess part approaches zero as n increases. Second, and more importantly, we demonstrate how the proposed necessary conditions can be adopted to design more effective parallel MCTS algorithms. To illustrate this, we propose a new parallel MCTS algorithm, called BU-UCT, by following our theoretical guidelines. The newly proposed algorithm, albeit preliminary, out-performs four competitive baselines on 11 out of 15 Atari games. We hope our theoretical results could inspire future work of more effective parallel MCTS.", + "title": "On Effective Parallelization of Monte Carlo Tree Search", + "authors": [ + "Anji Liu", + "Yitao Liang", + "Ji Liu", + "Guy Van den Broeck", + "Jianshu Chen" + ], + "emails": [ + "~Anji_Liu1", + "~Ji_Liu1", + "~Jianshu_Chen1", + "~Yitao_Liang1", + "~Guy_Van_den_Broeck1" + ], + "rank": 559 + }, + { + "url": "https://openreview.net/forum?id=tc5qisoB-C", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 7, + 7, + 8, + 6 + ], + "rating": "6.35", + "confidences": [ + 4, + 3, + 5, + 3, + 2 + ], + "abstract": "We study the problem of predicting and controlling the future state distribution of an autonomous agent. This problem, which can be viewed as a reframing of goal-conditioned reinforcement learning (RL), is centered around learning a conditional probability density function over future states. Instead of directly estimating this density function, we indirectly estimate this density function by training a classifier to predict whether an observation comes from the future. Via Bayes' rule, predictions from our classifier can be transformed into predictions over future states. Importantly, an off-policy variant of our algorithm allows us to predict the future state distribution of a new policy, without collecting new experience. 
This variant allows us to optimize functionals of a policy's future state distribution, such as the density of reaching a particular goal state. While conceptually similar to Q-learning, our work lays a principled foundation for goal-conditioned RL as density estimation, providing justification for goal-conditioned methods used in prior work. This foundation makes hypotheses about Q-learning, including the optimal goal-sampling ratio, which we confirm experimentally. Moreover, our proposed method is competitive with prior goal-conditioned RL methods.", + "title": "C-Learning: Learning to Achieve Goals via Recursive Classification", + "authors": [ + "Benjamin Eysenbach", + "Ruslan Salakhutdinov", + "Sergey Levine" + ], + "emails": [ + "~Ruslan_Salakhutdinov1", + "~Benjamin_Eysenbach1", + "~Sergey_Levine1" + ], + "rank": 560 + }, + { + "url": "https://openreview.net/forum?id=NQbnPjPYaG6", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 6, + 7, + 8 + ], + "rating": "6.35", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "Under mild regularity conditions, gradient-based methods converge globally to a critical point in the single-loss setting. This is known to break down for vanilla gradient descent when moving to multi-loss optimization, but can we hope to build some algorithm with global guarantees? We negatively resolve this open problem by proving that desirable convergence properties cannot simultaneously hold for any algorithm. Our result has more to do with the existence of games with no satisfactory outcomes, than with algorithms per se. More explicitly we construct a two-player game with zero-sum interactions whose losses are both coercive and analytic, but whose only simultaneous critical point is a strict maximum. Any 'reasonable' algorithm, defined to avoid strict maxima, will therefore fail to converge. This is fundamentally different from single losses, where coercivity implies existence of a global minimum. 
Moreover, we prove that a wide range of existing gradient-based methods almost surely have bounded but non-convergent iterates in a constructed zero-sum game for suitably small learning rates. It nonetheless remains an open question whether such behavior can arise in high-dimensional games of interest to ML practitioners, such as GANs or multi-agent RL.", + "title": "On the Impossibility of Global Convergence in Multi-Loss Optimization", + "authors": [ + "Alistair Letcher" + ], + "emails": [ + "~Alistair_Letcher1" + ], + "rank": 561 + }, + { + "url": "https://openreview.net/forum?id=Cb54AMqHQFP", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 8, + 6, + 6 + ], + "rating": "6.35", + "confidences": [ + 4, + 5, + 5, + 3 + ], + "abstract": "Network pruning is an effective method to reduce the computational expense of over-parameterized neural networks for deployment on low-resource systems. Recent state-of-the-art techniques for retraining pruned networks such as weight rewinding and learning rate rewinding have been shown to outperform the traditional fine-tuning technique in recovering the lost accuracy (Renda et al., 2020), but so far it is unclear what accounts for such performance. In this work, we conduct extensive experiments to verify and analyze the uncanny effectiveness of learning rate rewinding. We find that the reason behind the success of learning rate rewinding is the usage of a large learning rate. Similar phenomenon can be observed in other learning rate schedules that involve large learning rates, e.g., the 1-cycle learning rate schedule (Smith et al., 2019). By leveraging the right learning rate schedule in retraining, we demonstrate a counter-intuitive phenomenon in that randomly pruned networks could even achieve better performance than methodically pruned networks (fine-tuned with the conventional approach). 
Our results emphasize the cruciality of the learning rate schedule in pruned network retraining - a detail often overlooked by practitioners during the implementation of network pruning. ", + "title": "Network Pruning That Matters: A Case Study on Retraining Variants", + "authors": [ + "Duong Hoang Le", + "Binh-Son Hua" + ], + "emails": [ + "~Binh-Son_Hua1", + "~Duong_Hoang_Le2" + ], + "rank": 562 + }, + { + "url": "https://openreview.net/forum?id=po-DLlBuAuz", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 6, + 9, + 7 + ], + "rating": "6.35", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Many real-world applications of reinforcement learning (RL) require the agent to learn from a fixed set of trajectories, without collecting new interactions. Policy optimization under this setting is extremely challenging as: 1) the geometry of the objective function is hard to optimize efficiently; 2) the shift of data distributions causes high noise in the value estimation. In this work, we propose a simple yet effective policy iteration approach to batch RL using global optimization techniques known as continuation. By constraining the difference between the learned policy and the behavior policy that generates the fixed trajectories, and continuously relaxing the constraint, our method 1) helps the agent escape local optima; 2) reduces the error in policy evaluation in the optimization procedure. 
We present results on a variety of control tasks, game environments, and a recommendation task to empirically demonstrate the efficacy of our proposed method.", + "title": "Batch Reinforcement Learning Through Continuation Method", + "authors": [ + "Yijie Guo", + "Shengyu Feng", + "Nicolas Le Roux", + "Ed Chi", + "Honglak Lee", + "Minmin Chen" + ], + "emails": [ + "~Yijie_Guo1", + "~Nicolas_Le_Roux2", + "~Shengyu_Feng1", + "~Ed_Chi1", + "~Minmin_Chen1", + "~Honglak_Lee2" + ], + "rank": 563 + }, + { + "url": "https://openreview.net/forum?id=q-cnWaaoUTH", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 7 + ], + "rating": "6.33", + "confidences": [ + 4, + 5, + 3 + ], + "abstract": "Well-designed molecular representations (fingerprints) are vital to combine medical chemistry and deep learning. Whereas incorporating 3D geometry of molecules (i.e. conformations) in their representations seems beneficial, current 3D algorithms are still in infancy. In this paper, we propose a novel molecular representation algorithm which preserves 3D conformations of molecules with a Molecular Hamiltonian Network (HamNet). In HamNet, implicit positions and momentums of atoms in a molecule interact in the Hamiltonian Engine following the discretized Hamiltonian equations. These implicit coordinations are supervised with real conformations with translation- & rotation-invariant losses, and further used as inputs to the Fingerprint Generator, a message-passing neural network. 
Experiments show that the Hamiltonian Engine can well preserve molecular conformations, and that the fingerprints generated by HamNet achieve state-of-the-art performances on MoleculeNet, a standard molecular machine learning benchmark.", + "title": "Conformation-Guided Molecular Representation with Hamiltonian Neural Networks", + "authors": [ + "Ziyao Li", + "Shuwen Yang", + "Guojie Song", + "Lingsheng Cai" + ], + "emails": [ + "~Guojie_Song1", + "~Ziyao_Li1", + "pku.edu.cn" + ], + "rank": 564 + }, + { + "url": "https://openreview.net/forum?id=0O_cQfw6uEh", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 7 + ], + "rating": "6.33", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "This paper proposes a new type of generative model that is able to quickly learn a latent representation without an encoder. This is achieved using empirical Bayes to calculate the expectation of the posterior, which is implemented by initialising a latent vector with zeros, then using the gradient of the log-likelihood of the data with respect to this zero vector as new latent points. The approach has similar characteristics to autoencoders, but with a simpler architecture, and is demonstrated in a variational autoencoder equivalent that permits sampling. This also allows implicit representation networks to learn a space of implicit functions without requiring a hypernetwork, retaining their representation advantages across datasets. The experiments show that the proposed method converges faster, with significantly lower reconstruction error than autoencoders, while requiring half the parameters.", + "title": "Gradient Origin Networks", + "authors": [ + "Sam Bond-Taylor", + "Chris G. 
Willcocks" + ], + "emails": [ + "durham.ac.uk", + "~Sam_Bond-Taylor1" + ], + "rank": 565 + }, + { + "url": "https://openreview.net/forum?id=vLaHRtHvfFp", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 7 + ], + "rating": "6.33", + "confidences": [ + 5, + 4, + 3 + ], + "abstract": "A recent line of work in the machine learning community addresses the problem of predicting high-dimensional spatiotemporal phenomena by leveraging specific tools from the differential equations theory. Following this direction, we propose in this article a novel and general paradigm for this task based on a resolution method for partial differential equations: the separation of variables. This inspiration allows us to introduce a dynamical interpretation of spatiotemporal disentanglement. It induces a principled model based on learning disentangled spatial and temporal representations of a phenomenon to accurately predict future observations. We experimentally demonstrate the performance and broad applicability of our method against prior state-of-the-art models on physical and synthetic video datasets.", + "title": "PDE-Driven Spatiotemporal Disentanglement", + "authors": [ + "J\u00e9r\u00e9mie Don\u00e0", + "Jean-Yves Franceschi", + "sylvain lamprier", + "patrick gallinari" + ], + "emails": [ + "~Jean-Yves_Franceschi1", + "~patrick_gallinari1", + "lip6.fr", + "~sylvain_lamprier1" + ], + "rank": 566 + }, + { + "url": "https://openreview.net/forum?id=4cC0HFuVd2d", + "decision": "Reject", + "ratings": [ + 6, + 6, + 7 + ], + "rating": "6.33", + "confidences": [ + 2, + 4, + 3 + ], + "abstract": "Saliency methods can make deep neural network predictions more interpretable by identifying a set of critical features in an input sample, such as pixels that contribute most strongly to a prediction made by an image classifier. 
Unfortunately, recent evidence suggests that many saliency methods poorly perform, especially in situations where gradients are saturated, inputs contain adversarial perturbations, or predictions rely upon inter-feature dependence. To address these issues, we propose a framework that improves the robustness of saliency methods by following a two-step procedure. First, we introduce a perturbation mechanism that subtly varies the input sample without changing its intermediate representations. Using this approach, we can gather a corpus of perturbed data samples while ensuring that the perturbed and original input samples follow the same distribution. Second, we compute saliency maps for the perturbed samples and propose a new method to aggregate saliency maps. With this design, we offset the gradient saturation influence upon interpretation. From a theoretical perspective, we show that the aggregated saliency map not only captures inter-feature dependence but, more importantly, is robust against previously described adversarial perturbation methods. Following our theoretical analysis, we present experimental results suggesting that, both qualitatively and quantitatively, our saliency method outperforms existing methods, in a variety of applications.", + "title": "Decoy-enhanced Saliency Maps ", + "authors": [ + "Yang Young Lu", + "Wenbo Guo", + "Xinyu Xing", + "William Noble" + ], + "emails": [ + "~Wenbo_Guo1", + "~William_Noble1", + "~Yang_Young_Lu1", + "~Xinyu_Xing1" + ], + "rank": 567 + }, + { + "url": "https://openreview.net/forum?id=ixpSxO9flk3", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 8, + 4 + ], + "rating": "6.33", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Energy-Based Models (EBMs) present a flexible and appealing way to represent uncertainty. 
Despite recent advances, training EBMs on high-dimensional data remains a challenging problem as the state-of-the-art approaches are costly, unstable, and require considerable tuning and domain expertise to apply successfully. In this work, we present a simple method for training EBMs at scale which uses an entropy-regularized generator to amortize the MCMC sampling typically used in EBM training. We improve upon prior MCMC-based entropy regularization methods with a fast variational approximation. We demonstrate the effectiveness of our approach by using it to train tractable likelihood models. Next, we apply our estimator to the recently proposed Joint Energy Model (JEM), where we match the original performance with faster and stable training. This allows us to extend JEM models to semi-supervised classification on tabular data from a variety of continuous domains.", + "title": "No MCMC for me: Amortized sampling for fast and stable training of energy-based models", + "authors": [ + "Will Sussman Grathwohl", + "Jacob Jin Kelly", + "Milad Hashemi", + "Mohammad Norouzi", + "Kevin Swersky", + "David Duvenaud" + ], + "emails": [ + "~Mohammad_Norouzi1", + "~Milad_Hashemi1", + "~Will_Sussman_Grathwohl2", + "gmail.com", + "~David_Duvenaud2", + "~Kevin_Swersky1" + ], + "rank": 568 + }, + { + "url": "https://openreview.net/forum?id=9r30XCjf5Dt", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 6 + ], + "rating": "6.33", + "confidences": [ + 3, + 3, + 5, + 4 + ], + "abstract": "Poisoning attacks on Reinforcement Learning (RL) systems could take advantage of RL algorithm\u2019s vulnerabilities and cause failure of the learning. However, prior works on poisoning RL usually either unrealistically assume the attacker knows the underlying Markov Decision Process (MDP), or directly apply the poisoning methods in supervised learning to RL. 
In this work, we build a generic poisoning framework for online RL via a comprehensive investigation of heterogeneous poisoning models in RL. Without any prior knowledge of the MDP, we propose a strategic poisoning algorithm called Vulnerability-Aware Adversarial Critic Poison (VA2C-P), which works for on-policy deep RL agents, closing the gap that no poisoning method exists for policy-based RL agents. VA2C-P uses a novel metric, stability radius in RL, that measures the vulnerability of RL algorithms. Experiments on multiple deep RL agents and multiple environments show that our poisoning algorithm successfully prevents agents from learning a good policy or teaches the agents to converge to a target policy, with a limited attacking budget.", + "title": "Vulnerability-Aware Poisoning Mechanism for Online RL with Unknown Dynamics", + "authors": [ + "Yanchao Sun", + "Da Huo", + "Furong Huang" + ], + "emails": [ + "~Yanchao_Sun1", + "~Da_Huo1", + "~Furong_Huang1" + ], + "rank": 569 + }, + { + "url": "https://openreview.net/forum?id=xoHdgbQJohv", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 9, + 5, + 6 + ], + "rating": "6.33", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "We present a new methodology for detecting out-of-distribution (OOD) images by utilizing norms of the score estimates at multiple noise scales. A score is defined to be the gradient of the log density with respect to the input data. Our methodology is completely unsupervised and follows a straight forward training scheme. First, we train a deep network to estimate scores for L levels of noise. Once trained, we calculate the noisy score estimates for N in-distribution samples and take the L2-norms across the input dimensions (resulting in an NxL matrix). Then we train an auxiliary model (such as a Gaussian Mixture Model) to learn the in-distribution spatial regions in this L-dimensional space. 
This auxiliary model can now be used to identify points that reside outside the learned space. Despite its simplicity, our experiments show that this methodology significantly outperforms the state-of-the-art in detecting out-of-distribution images. For example, our method can effectively separate CIFAR-10 (inlier) and SVHN (OOD) images, a setting which has been previously shown to be difficult for deep likelihood models.", + "title": "Multiscale Score Matching for Out-of-Distribution Detection", + "authors": [ + "Ahsan Mahmood", + "Junier Oliva", + "Martin Andreas Styner" + ], + "emails": [ + "~Junier_Oliva1", + "~Ahsan_Mahmood1", + "~Martin_Andreas_Styner1" + ], + "rank": 570 + }, + { + "url": "https://openreview.net/forum?id=W1G1JZEIy5_", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7 + ], + "rating": "6.33", + "confidences": [ + 5, + 3, + 4 + ], + "abstract": "Neural text decoding algorithms strongly influence the quality of texts generated using language models, but popular algorithms like top-k, top-p (nucleus), and temperature-based sampling may yield texts that have objectionable repetition or incoherence. Although these methods generate high-quality text after ad hoc parameter tuning that depends on the language model and the length of generated text, not much is known about the control they provide over the statistics of the output. This is important, however, since recent reports show that humans prefer when perplexity is neither too much nor too little and since we experimentally show that cross-entropy (log of perplexity) has a near-linear relation with repetition. First, we provide a theoretical analysis of perplexity in top-k, top-p, and temperature sampling, under Zipfian statistics. Then, we use this analysis to design a feedback-based adaptive top-k text decoding algorithm called mirostat that generates text (of any length) with a predetermined target value of perplexity without any tuning. 
Experiments show that for low values of k and p, perplexity drops significantly with generated text length and leads to excessive repetitions (the boredom trap). Contrarily, for large values of k and p, perplexity increases with generated text length and leads to incoherence (confusion trap). Mirostat avoids both traps. Specifically, we show that setting target perplexity value beyond a threshold yields negligible sentence-level repetitions. Experiments with\nhuman raters for fluency, coherence, and quality further verify our findings.", + "title": "MIROSTAT: A NEURAL TEXT DECODING ALGORITHM THAT DIRECTLY CONTROLS PERPLEXITY", + "authors": [ + "Sourya Basu", + "Govardana Sachitanandam Ramachandran", + "Nitish Shirish Keskar", + "Lav R. Varshney" + ], + "emails": [ + "~Lav_R._Varshney1", + "~Sourya_Basu1", + "salesforce.com", + "~Nitish_Shirish_Keskar1" + ], + "rank": 571 + }, + { + "url": "https://openreview.net/forum?id=dOcQK-f4byz", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 7, + 6 + ], + "rating": "6.33", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "We study two fundamental questions in neuro-symbolic computing: can deep learning tackle challenging problems in logics end-to-end, and can neural networks learn the semantics of logics. In this work we focus on linear-time temporal logic (LTL), as it is widely used in verification. We train a Transformer on the problem to directly predict a solution, i.e. a trace, to a given LTL formula. The training data is generated with classical solvers, which, however, only provide one of many possible solutions to each formula. We demonstrate that it is sufficient to train on those particular solutions to formulas, and that Transformers can predict solutions even to formulas from benchmarks from the literature on which the classical solver timed out. 
Transformers also generalize to the semantics of the logics: while they often deviate from the solutions found by the classical solvers, they still predict correct solutions to most formulas.", + "title": "Teaching Temporal Logics to Neural Networks", + "authors": [ + "Christopher Hahn", + "Frederik Schmitt", + "Jens U. Kreber", + "Markus Norman Rabe", + "Bernd Finkbeiner" + ], + "emails": [ + "~Jens_U._Kreber1", + "~Markus_Norman_Rabe1", + "cispa.saarland", + "~Christopher_Hahn1", + "~Frederik_Schmitt1" + ], + "rank": 572 + }, + { + "url": "https://openreview.net/forum?id=tyd9yxioXgO", + "decision": "Reject", + "ratings": [ + 7, + 5, + 6, + 7 + ], + "rating": "6.33", + "confidences": [ + 3, + 2, + 4, + 3 + ], + "abstract": "Videos of actions are complex signals, containing rich compositional structure. Current video generation models are limited in their ability to generate such videos. To address this challenge, we introduce a generative model (AG2Vid) that can be conditioned on an Action Graph, a structure that naturally represents the dynamics of actions and interactions between objects. Our AG2Vid model disentangles appearance and position features, allowing for more accurate generation. AG2Vid is evaluated on the CATER and Something-Something datasets and outperforms other baselines. Finally, we show how Action Graphs can be used for generating novel compositions of actions. 
", + "title": "Compositional Video Synthesis with Action Graphs", + "authors": [ + "Amir Bar", + "Roei Herzig", + "Xiaolong Wang", + "Gal Chechik", + "Trevor Darrell", + "Amir Globerson" + ], + "emails": [ + "~Gal_Chechik1", + "~Amir_Bar1", + "~Amir_Globerson1", + "~Trevor_Darrell2", + "~Xiaolong_Wang3", + "~Roei_Herzig2" + ], + "rank": 573 + }, + { + "url": "https://openreview.net/forum?id=h0de3QWtGG", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 7, + 7 + ], + "rating": "6.33", + "confidences": [ + 3, + 2, + 3, + 4 + ], + "abstract": "Building interpretable parameterizations of real-world decision-making on the basis of demonstrated behavior--i.e. trajectories of observations and actions made by an expert maximizing some unknown reward function--is essential for introspecting and auditing policies in different institutions. In this paper, we propose learning explanations of expert decisions by modeling their reward function in terms of preferences with respect to ``\"what if'' outcomes: Given the current history of observations, what would happen if we took a particular action? To learn these cost-benefit tradeoffs associated with the expert's actions, we integrate counterfactual reasoning into batch inverse reinforcement learning. This offers a principled way of defining reward functions and explaining expert behavior, and also satisfies the constraints of real-world decision-making---where active experimentation is often impossible (e.g. in healthcare). Additionally, by estimating the effects of different actions, counterfactuals readily tackle the off-policy nature of policy evaluation in the batch setting, and can naturally accommodate settings where the expert policies depend on histories of observations rather than just current states. 
Through illustrative experiments in both real and simulated medical environments, we highlight the effectiveness of our batch, counterfactual inverse reinforcement learning approach in recovering accurate and interpretable descriptions of behavior.", + "title": "Learning \"What-if\" Explanations for Sequential Decision-Making", + "authors": [ + "Ioana Bica", + "Daniel Jarrett", + "Alihan H\u00fcy\u00fck", + "Mihaela van der Schaar" + ], + "emails": [ + "~Daniel_Jarrett1", + "~Ioana_Bica1", + "cam.ac.uk", + "~Mihaela_van_der_Schaar2" + ], + "rank": 574 + }, + { + "url": "https://openreview.net/forum?id=OodqmQT3fir", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 7 + ], + "rating": "6.33", + "confidences": [ + 3, + 3, + 2, + 4 + ], + "abstract": "Value Iteration Networks (VINs) have emerged as a popular method to perform implicit planning within deep reinforcement learning, enabling performance improvements on tasks requiring long-range reasoning and understanding of environment dynamics. This came with several limitations, however: the model is not explicitly incentivised to perform meaningful planning computations, the underlying state space is assumed to be discrete, and the Markov decision process (MDP) is assumed fixed and known. We propose eXecuted Latent Value Iteration Networks (XLVINs), which combine recent developments across contrastive self-supervised learning, graph representation learning and neural algorithmic reasoning to alleviate all of the above limitations, successfully deploying VIN-style models on generic environments. 
XLVINs match the performance of VIN-like models when the underlying MDP is discrete, fixed and known, and provide significant improvements to model-free baselines across three general MDP setups.", + "title": "XLVIN: eXecuted Latent Value Iteration Nets", + "authors": [ + "Andreea Deac", + "Petar Veli\u010dkovi\u0107", + "Ognjen Milinkovic", + "Pierre-Luc Bacon", + "Jian Tang", + "Mladen Nikolic" + ], + "emails": [ + "~Petar_Veli\u010dkovi\u01071", + "~Andreea_Deac1", + "gmail.com", + "~Jian_Tang1", + "~Pierre-Luc_Bacon1" + ], + "rank": 575 + }, + { + "url": "https://openreview.net/forum?id=VbLH04pRA3", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7 + ], + "rating": "6.33", + "confidences": [ + 3, + 5, + 4 + ], + "abstract": "We study the problem of using low cost to search for hyperparameter configurations in a large search space with heterogeneous evaluation cost and model quality.\nWe propose a blended search strategy to combine the strengths of global and local search, and prioritize them on the fly with the goal of minimizing the total cost spent in finding good configurations. 
Our approach demonstrates robust performance for tuning both tree-based models and deep neural networks on a large AutoML benchmark, as well as superior performance in model quality, time, and resource consumption for a production transformer-based NLP model fine-tuning task.", + "title": "ECONOMIC HYPERPARAMETER OPTIMIZATION WITH BLENDED SEARCH STRATEGY", + "authors": [ + "Chi Wang", + "Qingyun Wu", + "Silu Huang", + "Amin Saied" + ], + "emails": [ + "~Chi_Wang3", + "microsoft.com" + ], + "rank": 576 + }, + { + "url": "https://openreview.net/forum?id=Ogga20D2HO-", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7 + ], + "rating": "6.33", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Federated learning (FL) allows edge devices to collectively learn a model without directly sharing data within each device, thus preserving privacy and eliminating the need to store data globally. While there are promising results under the assumption of independent and identically distributed (iid) local data, current state-of-the-art algorithms suffer a performance degradation as the heterogeneity of local data across clients increases. To resolve this issue, we propose a simple framework, \\emph{Mean Augmented Federated Learning (MAFL)}, where clients send and receive \\emph{averaged} local data, subject to the privacy requirements of target applications. Under our framework, we propose a new augmentation algorithm, named \\emph{FedMix}, which is inspired by a phenomenal yet simple data augmentation method, Mixup, but does not require local raw data to be directly shared among devices. 
Our method shows greatly improved performance in the standard benchmark datasets of FL, under highly non-iid federated settings, compared to conventional algorithms.", + "title": "FedMix: Approximation of Mixup under Mean Augmented Federated Learning", + "authors": [ + "Tehrim Yoon", + "Sumin Shin", + "Sung Ju Hwang", + "Eunho Yang" + ], + "emails": [ + "~Eunho_Yang1", + "~Tehrim_Yoon1", + "kaist.ac.kr", + "~Sung_Ju_Hwang1" + ], + "rank": 577 + }, + { + "url": "https://openreview.net/forum?id=3X64RLgzY6O", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7 + ], + "rating": "6.33", + "confidences": [ + 3, + 3, + 3 + ], + "abstract": "Understanding the algorithmic bias of stochastic gradient descent (SGD) is one of the key challenges in modern machine learning and deep learning theory. Most of the existing works, however, focus on very small or even infinitesimal learning rate regime, and fail to cover practical scenarios where the learning rate is moderate and annealing. In this paper, we make an initial attempt to characterize the particular regularization effect of SGD in the moderate learning rate regime by studying its behavior for optimizing an overparameterized linear regression problem. In this case, SGD and GD are known to converge to the unique minimum-norm solution; however, with the moderate and annealing learning rate, we show that they exhibit different directional bias: SGD converges along the large eigenvalue directions of the data matrix, while GD goes after the small eigenvalue directions. Furthermore, we show that such directional bias does matter when early stopping is adopted, where the SGD output is nearly optimal but the GD output is suboptimal. 
Finally, our theory explains several folk arts in practice used for SGD hyperparameter tuning, such as (1) linearly scaling the initial learning rate with batch size; and (2) overrunning SGD with high learning rate even when the loss stops decreasing.", + "title": "Direction Matters: On the Implicit Bias of Stochastic Gradient Descent with Moderate Learning Rate", + "authors": [ + "Jingfeng Wu", + "Difan Zou", + "Vladimir Braverman", + "Quanquan Gu" + ], + "emails": [ + "~Jingfeng_Wu1", + "~Quanquan_Gu1", + "~Vladimir_Braverman1", + "~Difan_Zou1" + ], + "rank": 578 + }, + { + "url": "https://openreview.net/forum?id=hWr3e3r-oH5", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 7 + ], + "rating": "6.33", + "confidences": [ + 3, + 2, + 5, + 5 + ], + "abstract": "Temporally localizing actions in videos is one of the key components for video understanding. Learning from weakly-labeled data is seen as a potential solution towards avoiding expensive frame-level annotations. Different from other works which only depend on visual-modality, we propose to learn richer audiovisual representation for weakly-supervised action localization. First, we propose a multi-stage cross-attention mechanism to collaboratively fuse audio and visual features, which preserves the intra-modal characteristics. Second, to model both foreground and background frames, we construct an open-max classifier that treats the background class as an open-set. Third, for precise action localization, we design consistency losses to enforce temporal continuity for the action class prediction, and also help with foreground-prediction reliability. 
Extensive experiments on two publicly available video-datasets (AVE and ActivityNet1.2) show that the proposed method effectively fuses audio and visual modalities, and achieves the state-of-the-art results for weakly-supervised action localization.", + "title": "Cross-Attentional Audio-Visual Fusion for Weakly-Supervised Action Localization", + "authors": [ + "Jun-Tae Lee", + "Mihir Jain", + "Hyoungwoo Park", + "Sungrack Yun" + ], + "emails": [ + "~Jun-Tae_Lee1", + "qti.qualcomm.com", + "~Mihir_Jain1" + ], + "rank": 579 + }, + { + "url": "https://openreview.net/forum?id=0pxiMpCyBtr", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 6 + ], + "rating": "6.33", + "confidences": [ + 3, + 1, + 3, + 2 + ], + "abstract": "It is computationally challenging to learn flexible monotonic functions that guarantee model behavior and provide interpretability beyond a few input features, and in a time where minimizing resource use is increasingly important, we must be able to learn such models that are still efficient. In this paper we show how to effectively and efficiently learn such functions using Kronecker-Factored Lattice (KFL), an efficient reparameterization of flexible monotonic lattice regression via Kronecker product. Both computational and storage costs scale linearly in the number of input features, which is a significant improvement over existing methods that grow exponentially. We also show that we can still properly enforce monotonicity and other shape constraints. The KFL function class consists of products of piecewise-linear functions, and the size of the function class can be further increased through ensembling. We prove that the function class of an ensemble of M base KFL models strictly increases as M increases up to a certain threshold. Beyond this threshold, every multilinear interpolated lattice function can be expressed. 
Our experimental results demonstrate that KFL trains faster with fewer parameters while still achieving accuracy and evaluation speeds comparable to or better than the baseline methods and preserving monotonicity guarantees on the learned model.", + "title": "Monotonic Kronecker-Factored Lattice", + "authors": [ + "William Taylor Bakst", + "Nobuyuki Morioka", + "Erez Louidor" + ], + "emails": [ + "~Erez_Louidor1", + "~Nobuyuki_Morioka1", + "~William_Taylor_Bakst1" + ], + "rank": 580 + }, + { + "url": "https://openreview.net/forum?id=hiq1rHO8pNT", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6 + ], + "rating": "6.33", + "confidences": [ + 3, + 3, + 3 + ], + "abstract": "Achieving state-of-the-art performance on natural language understanding tasks typically relies on fine-tuning a fresh model for every task. Consequently, this approach leads to a higher overall parameter cost, along with higher technical maintenance for serving multiple models. Learning a single multi-task model that is able to do well for all the tasks has been a challenging and yet attractive proposition. In this paper, we propose HyperGrid Transformers, a new Transformer architecture that leverages task-conditioned hyper networks for controlling its feed-forward layers. Specifically, we propose a decomposable hypernetwork that learns grid-wise projections that help to specialize regions in weight matrices for different tasks. In order to construct the proposed hypernetwork, our method learns the interactions and composition between a global (task-agnostic) state and a local task-specific state. We conduct an extensive set of experiments on GLUE/SuperGLUE. On the SuperGLUE test set, we match the performance of the state-of-the-art while being 16 times more parameter efficient. 
Our method helps bridge the gap between fine-tuning and multi-task learning approaches.", + "title": "HyperGrid Transformers: Towards A Single Model for Multiple Tasks", + "authors": [ + "Yi Tay", + "Zhe Zhao", + "Dara Bahri", + "Donald Metzler", + "Da-Cheng Juan" + ], + "emails": [ + "~Dara_Bahri1", + "~Zhe_Zhao3", + "google.com", + "~Yi_Tay1", + "~Da-Cheng_Juan1" + ], + "rank": 581 + }, + { + "url": "https://openreview.net/forum?id=9xC2tWEwBD", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 6, + 3, + 8 + ], + "rating": "6.33", + "confidences": [ + 4, + 3, + 3, + 5 + ], + "abstract": "Recent increases in the computational demands of deep neural networks (DNNs), combined with the observation that most input samples require only simple models, have sparked interest in input-adaptive multi-exit architectures, such as MSDNets or Shallow-Deep Networks. These architectures enable faster inferences and could bring DNNs to low-power devices, e.g., in the Internet of Things (IoT). However, it is unknown if the computational savings provided by this approach are robust against adversarial pressure. In particular, an adversary may aim to slowdown adaptive DNNs by increasing their average inference time\u2014a threat analogous to the denial-of-service attacks from the Internet. In this paper, we conduct a systematic evaluation of this threat by experimenting with three generic multi-exit DNNs (based on VGG16, MobileNet, and ResNet56) and a custom multi-exit architecture, on two popular image classification benchmarks (CIFAR-10 and Tiny ImageNet). To this end, we show that adversarial example-crafting techniques can be modified to cause slowdown, and we propose a metric for comparing their impact on different architectures. We show that a slowdown attack reduces the efficacy of multi-exit DNNs by 90\u2013100%, and it amplifies the latency by 1.5\u20135\u00d7 in a typical IoT deployment. 
We also show that it is possible to craft universal, reusable perturbations and that the attack can be effective in realistic black-box scenarios, where the attacker has limited knowledge about the victim. Finally, we show that adversarial training provides limited protection against slowdowns. These results suggest that further research is needed for defending multi-exit architectures against this emerging threat. Our code is available at https://github.com/sanghyun-hong/deepsloth. ", + "title": "A Panda? No, It's a Sloth: Slowdown Attacks on Adaptive Multi-Exit Neural Network Inference", + "authors": [ + "Sanghyun Hong", + "Yigitcan Kaya", + "Ionu\u021b-Vlad Modoranu", + "Tudor Dumitras" + ], + "emails": [ + "hotmail.com", + "~Yigitcan_Kaya1", + "~Tudor_Dumitras1", + "~Sanghyun_Hong1" + ], + "rank": 582 + }, + { + "url": "https://openreview.net/forum?id=hx1IXFHAw7R", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 5, + 7 + ], + "rating": "6.33", + "confidences": [ + 3, + 2, + 3, + 4 + ], + "abstract": "We propose a novel setting for reinforcement learning that combines two common real-world difficulties: presence of observations (such as camera images) and factored states (such as location of objects). In our setting, the agent receives observations generated stochastically from a \"latent\" factored state. These observations are \"rich enough\" to enable decoding of the latent state and remove partial observability concerns. Since the latent state is combinatorial, the size of state space is exponential in the number of latent factors. We create a learning algorithm FactoRL (Fact-o-Rel) for this setting, which uses noise-contrastive learning to identify latent structures in emission processes and discover a factorized state space. We derive polynomial sample complexity guarantees for FactoRL which polynomially depend upon the number factors, and very weakly depend on the size of the observation space. 
We also provide a guarantee of polynomial time complexity when given access to an efficient planning algorithm.", + "title": "Provable Rich Observation Reinforcement Learning with Combinatorial Latent States", + "authors": [ + "Dipendra Misra", + "Qinghua Liu", + "Chi Jin", + "John Langford" + ], + "emails": [ + "microsoft.com", + "~Chi_Jin1", + "~Qinghua_Liu1", + "~Dipendra_Misra1" + ], + "rank": 583 + }, + { + "url": "https://openreview.net/forum?id=cYek5NoXNiX", + "decision": "Reject", + "ratings": [ + 5, + 5, + 7, + 8 + ], + "rating": "6.33", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Modern policy gradient algorithms, notably Proximal Policy Optimization (PPO), rely on an arsenal of heuristics, including loss clipping and gradient clipping, to ensure successful learning. These heuristics are reminiscent of techniques from robust statistics, commonly used for estimation in outlier-rich (\"heavy-tailed\") regimes. In this paper, we present a detailed empirical study to characterize the heavy-tailed nature of the gradients of the PPO surrogate reward function. \nWe demonstrate pronounced heavy-tailedness of the gradients, specifically for the actor network, which increases as the current policy diverges from the behavioral one (i.e., as the agent goes further off policy). Further examination implicates the likelihood ratios and advantages in the surrogate reward as the main sources to the observed heavy-tailedness. Subsequently, we study the effects of the standard PPO clipping heuristics, demonstrating how these tricks primarily serve to offset heavy-tailedness in gradients. Motivated by these connections, we propose incorporating GMOM (a high-dimensional robust estimator) into PPO as a substitute for three clipping tricks, achieving performance close to PPO (with all heuristics enabled) on a battery of MuJoCo continuous control tasks. 
", + "title": "On Proximal Policy Optimization's Heavy-Tailed Gradients ", + "authors": [ + "Saurabh Garg", + "Joshua Zhanson", + "Emilio Parisotto", + "Adarsh Prasad", + "J Zico Kolter", + "Zachary Chase Lipton", + "Sivaraman Balakrishnan", + "Ruslan Salakhutdinov", + "Pradeep Kumar Ravikumar" + ], + "emails": [ + "andrew.cmu.edu", + "~Sivaraman_Balakrishnan2", + "~Adarsh_Prasad1", + "~Pradeep_Kumar_Ravikumar1", + "~J_Zico_Kolter1", + "~Emilio_Parisotto1", + "~Zachary_Chase_Lipton1", + "~Ruslan_Salakhutdinov1", + "~Saurabh_Garg3" + ], + "rank": 584 + }, + { + "url": "https://openreview.net/forum?id=D_KeYoqCYC", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6 + ], + "rating": "6.33", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Dimensionality reduction methods for count data are critical to a wide range of applications in medical informatics and other fields where model interpretability is paramount. For such data, hierarchical Poisson matrix factorization (HPF) and other sparse probabilistic non-negative matrix factorization (NMF) methods are considered to be interpretable generative models. They consist of sparse transformations for decoding their learned representations into predictions. However, sparsity in representation decoding does not necessarily imply sparsity in the encoding of representations from the original data features. HPF is often incorrectly interpreted in the literature as if it possesses encoder sparsity. The distinction between decoder sparsity and encoder sparsity is subtle but important. Due to the lack of encoder sparsity, HPF does not possess the column-clustering property of classical NMF -- the factor loading matrix does not sufficiently define how each factor is formed from the original features. 
We address this deficiency by self-consistently enforcing encoder sparsity, using a generalized additive model (GAM), thereby allowing one to relate each representation coordinate to a subset of the original data features. In doing so, the method also gains the ability to perform feature selection. We demonstrate our method on simulated data and give an example of how encoder sparsity is of practical use in a concrete application of representing inpatient comorbidities in Medicare patients.", + "title": "Sparse encoding for more-interpretable feature-selecting representations in probabilistic matrix factorization", + "authors": [ + "Joshua C Chang", + "Patrick Fletcher", + "Jungmin Han", + "Ted L Chang", + "Shashaank Vattikuti", + "Bart Desmet", + "Ayah Zirikly", + "Carson C Chow" + ], + "emails": [ + "mederrata.com", + "gmail.com", + "~Joshua_C_Chang1", + "niddk.nih.gov" + ], + "rank": 585 + }, + { + "url": "https://openreview.net/forum?id=E3Ys6a1NTGT", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6 + ], + "rating": "6.33", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "We study worst-case guarantees on the expected return of fixed-dataset policy optimization algorithms. Our core contribution is a unified conceptual and mathematical framework for the study of algorithms in this regime. This analysis reveals that for naive approaches, the possibility of erroneous value overestimation leads to a difficult-to-satisfy requirement: in order to guarantee that we select a policy which is near-optimal, we may need the dataset to be informative of the value of every policy. To avoid this, algorithms can follow the pessimism principle, which states that we should choose the policy which acts optimally in the worst possible world. We show why pessimistic algorithms can achieve good performance even when the dataset is not informative of every policy, and derive families of algorithms which follow this principle. 
These theoretical findings are validated by experiments on a tabular gridworld, and deep learning experiments on four MinAtar environments.", + "title": "The Importance of Pessimism in Fixed-Dataset Policy Optimization", + "authors": [ + "Jacob Buckman", + "Carles Gelada", + "Marc G Bellemare" + ], + "emails": [ + "openai.com", + "~Jacob_Buckman2", + "~Marc_G_Bellemare1" + ], + "rank": 586 + }, + { + "url": "https://openreview.net/forum?id=7dpmlkBuJFC", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7 + ], + "rating": "6.33", + "confidences": [ + 3, + 3, + 3 + ], + "abstract": "Differentially private SGD (DP-SGD) is one of the most popular methods for solving differentially private empirical risk minimization (ERM). Due to its noisy perturbation on each gradient update, the error rate of DP-SGD scales with the ambient dimension p, the number of parameters in the model. Such dependence can be problematic for over-parameterized models where p\u226bn, the number of training samples. Existing lower bounds on private ERM show that such dependence on p is inevitable in the worst case. In this paper, we circumvent the dependence on the ambient dimension by leveraging a low-dimensional structure of gradient space in deep networks---that is, the stochastic gradients for deep nets usually stay in a low dimensional subspace in the training process. We propose Projected DP-SGD that performs noise reduction by projecting the noisy gradients to a low-dimensional subspace, which is given by the top gradient eigenspace on a small public dataset. We provide a general sample complexity analysis on the public dataset for the gradient subspace identification problem and demonstrate that under certain low-dimensional assumptions the public sample complexity only grows logarithmically in p. 
Finally, we provide a theoretical analysis and empirical evaluations to show that our method can substantially improve the accuracy of DP-SGD in the high privacy regime (corresponding to low privacy loss \u03f5).\n\n", + "title": "Bypassing the Ambient Dimension: Private SGD with Gradient Subspace Identification", + "authors": [ + "Yingxue Zhou", + "Steven Wu", + "Arindam Banerjee" + ], + "emails": [ + "~Steven_Wu1", + "~Arindam_Banerjee1", + "~Yingxue_Zhou1" + ], + "rank": 587 + }, + { + "url": "https://openreview.net/forum?id=Qr0aRliE_Hb", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7 + ], + "rating": "6.33", + "confidences": [ + 3, + 3, + 3 + ], + "abstract": "Mixed precision quantization improves DNN performance by assigning different layers with different bit-width values. Searching for the optimal bit-width for each layer, however, remains a challenge. Deep Reinforcement Learning (DRL) shows some recent promise. It however suffers instability due to function approximation errors, causing large variances in the early training stages, slow convergence, and suboptimal policies in the mixed-precision quantization problem. This paper proposes augmented DRL (ADRL) as a way to alleviate these issues. This new strategy augments the neural networks in DRL with a complementary scheme to boost the performance of learning. The paper examines the effectiveness of ADRL both analytically and empirically, showing that it can produce more accurate quantized models than the state of the art DRL-based quantization while improving the learning speed by 4.5-64 times. 
", + "title": "Simple Augmentation Goes a Long Way: ADRL for DNN Quantization", + "authors": [ + "Lin Ning", + "Guoyang Chen", + "Weifeng Zhang", + "Xipeng Shen" + ], + "emails": [ + "~Lin_Ning1", + "~Xipeng_Shen1", + "~Guoyang_Chen1", + "alibaba-inc.com" + ], + "rank": 588 + }, + { + "url": "https://openreview.net/forum?id=p5uylG94S68", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 5, + 7, + 7, + 7 + ], + "rating": "6.33", + "confidences": [ + 4, + 3, + 3, + 2, + 3 + ], + "abstract": "We contribute to micro-data model-based reinforcement learning (MBRL) by rigorously comparing popular generative models using a fixed (random shooting) control agent. We find that on an environment that requires multimodal posterior predictives, mixture density nets outperform all other models by a large margin. When multimodality is not required, our surprising finding is that we do not need probabilistic posterior predictives: deterministic models are on par, in fact they consistently (although non-significantly) outperform their probabilistic counterparts. We also found that heteroscedasticity at training time, perhaps acting as a regularizer, improves predictions at longer horizons. At the methodological side, we design metrics and an experimental protocol which can be used to evaluate the various models, predicting their asymptotic performance when using them on the control problem. 
Using this framework, we improve the state-of-the-art sample complexity of MBRL on Acrobot by two to four folds, using an aggressive training schedule which is outside of the hyperparameter interval usually considered.", + "title": "Model-based micro-data reinforcement learning: what are the crucial model properties and which model to choose?", + "authors": [ + "Bal\u00e1zs K\u00e9gl", + "Gabriel Hurtado", + "Albert Thomas" + ], + "emails": [ + "~Bal\u00e1zs_K\u00e9gl2", + "~Albert_Thomas1", + "gmail.com" + ], + "rank": 589 + }, + { + "url": "https://openreview.net/forum?id=FZ1oTwcXchK", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 7 + ], + "rating": "6.33", + "confidences": [ + 3, + 3, + 3 + ], + "abstract": "Spiking neural networks (SNNs) are biology-inspired artificial neural networks (ANNs) that comprise of spiking neurons to process asynchronous discrete signals. While more efficient in power consumption and inference speed on the neuromorphic hardware, SNNs are usually difficult to train directly from scratch with spikes due to the discreteness. As an alternative, many efforts have been devoted to converting conventional ANNs into SNNs by copying the weights from ANNs and adjusting the spiking threshold potential of neurons in SNNs. Researchers have designed new SNN architectures and conversion algorithms to diminish the conversion error. However, an effective conversion should address the difference between the SNN and ANN architectures with an efficient approximation of the loss function, which is missing in the field. In this work, we analyze the conversion error by recursive reduction to layer-wise summation and propose a novel strategic pipeline that transfers the weights to the target SNN by combining threshold balance and soft-reset mechanisms. This pipeline enables almost no accuracy loss between the converted SNNs and conventional ANNs with only \u223c1/10 of the typical SNN simulation time. 
Our method is promising to get implanted onto embedded platforms with better support of SNNs with limited energy and memory. Codes are available at https://github.com/Jackn0/snn_optimal_conversion_pipeline.", + "title": "Optimal Conversion of Conventional Artificial Neural Networks to Spiking Neural Networks", + "authors": [ + "Shikuang Deng", + "Shi Gu" + ], + "emails": [ + "~Shikuang_Deng1", + "~Shi_Gu1" + ], + "rank": 590 + }, + { + "url": "https://openreview.net/forum?id=GFsU8a0sGB", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7 + ], + "rating": "6.33", + "confidences": [ + 2, + 4, + 3 + ], + "abstract": "Federated learning is typically approached as an optimization problem, where the goal is to minimize a global loss function by distributing computation across client devices that possess local data and specify different parts of the global objective. We present an alternative perspective and formulate federated learning as a posterior inference problem, where the goal is to infer a global posterior distribution by having client devices each infer the posterior of their local data. While exact inference is often intractable, this perspective provides a principled way to search for global optima in federated settings. Further, starting with the analysis of federated quadratic objectives, we develop a computation- and communication-efficient approximate posterior inference algorithm\u2014federated posterior averaging (FedPA). Our algorithm uses MCMC for approximate inference of local posteriors on the clients and efficiently communicates their statistics to the server, where the latter uses them to refine a global estimate of the posterior mode. 
Finally, we show that FedPA generalizes federated averaging (FedAvg), can similarly benefit from adaptive optimizers, and yields state-of-the-art results on four realistic and challenging benchmarks, converging faster, to better optima.", + "title": "Federated Learning via Posterior Averaging: A New Perspective and Practical Algorithms", + "authors": [ + "Maruan Al-Shedivat", + "Jennifer Gillenwater", + "Eric Xing", + "Afshin Rostamizadeh" + ], + "emails": [ + "~Eric_Xing1", + "~Afshin_Rostamizadeh1", + "~Maruan_Al-Shedivat1", + "~Jennifer_Gillenwater1" + ], + "rank": 591 + }, + { + "url": "https://openreview.net/forum?id=LiX3ECzDPHZ", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 8, + 7 + ], + "rating": "6.33", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "We aim to help users communicate their intent to machines using flexible, adaptive interfaces that translate arbitrary user input into desired actions. In this work, we focus on assistive typing applications in which a user cannot operate a keyboard, but can instead supply other inputs, such as webcam images that capture eye gaze or neural activity measured by a brain implant. Standard methods train a model on a fixed dataset of user inputs, then deploy a static interface that does not learn from its mistakes; in part, because extracting an error signal from user behavior can be challenging. We investigate a simple idea that would enable such interfaces to improve over time, with minimal additional effort from the user: online learning from user feedback on the accuracy of the interface's actions. In the typing domain, we leverage backspaces as feedback that the interface did not perform the desired action. We propose an algorithm called x-to-text (X2T) that trains a predictive model of this feedback signal, and uses this model to fine-tune any existing, default interface for translating user input into actions that select words or characters. 
We evaluate X2T through a small-scale online user study with 12 participants who type sentences by gazing at their desired words, a large-scale observational study on handwriting samples from 60 users, and a pilot study with one participant using an electrocorticography-based brain-computer interface. The results show that X2T learns to outperform a non-adaptive default interface, stimulates user co-adaptation to the interface, personalizes the interface to individual users, and can leverage offline data collected from the default interface to improve its initial performance and accelerate online learning.", + "title": "X2T: Training an X-to-Text Typing Interface with Online Learning from User Feedback", + "authors": [ + "Jensen Gao", + "Siddharth Reddy", + "Glen Berseth", + "Nicholas Hardy", + "Nikhilesh Natraj", + "Karunesh Ganguly", + "Anca Dragan", + "Sergey Levine" + ], + "emails": [ + "~Siddharth_Reddy1", + "~Anca_Dragan1", + "ucsf.edu", + "~Glen_Berseth1", + "gmail.com", + "berkeley.edu", + "~Sergey_Levine1" + ], + "rank": 592 + }, + { + "url": "https://openreview.net/forum?id=yqPnIRhHtZv", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 7 + ], + "rating": "6.31", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "Learning task-specific representations of persistence diagrams is an important problem in topological data analysis and machine learning. However, current state of the art methods are restricted in terms of their expressivity as they are focused on Euclidean representations. Persistence diagrams often contain features of infinite persistence (i.e., essential features) and Euclidean spaces shrink their importance relative to non-essential features because they cannot assign infinite distance to finite points. To deal with this issue, we propose a method to learn representations of persistence diagrams on hyperbolic spaces, more specifically on the Poincare ball. 
By representing features of infinite persistence infinitesimally close to the boundary of the ball, their distance to non-essential features approaches infinity, thereby their relative importance is preserved. This is achieved without utilizing extremely high values for the learnable parameters, thus the representation can be fed into downstream optimization methods and trained efficiently in an end-to-end fashion. We present experimental results on graph and image classification tasks and show that the performance of our method is on par with or exceeds the performance of other state of the art methods.\n", + "title": "Learning Hyperbolic Representations of Topological Features", + "authors": [ + "Panagiotis Kyriakis", + "Iordanis Fostiropoulos", + "Paul Bogdan" + ], + "emails": [ + "~Iordanis_Fostiropoulos1", + "~Paul_Bogdan1", + "~Panagiotis_Kyriakis1" + ], + "rank": 593 + }, + { + "url": "https://openreview.net/forum?id=Cz3dbFm5u-", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 5 + ], + "rating": "6.31", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "The advances in neural networks have driven many companies to provide prediction services to users in a wide range of applications. However, current prediction systems raise privacy concerns regarding the user's private data. A cryptographic neural network inference service is an efficient way to allow two parties to execute neural network inference without revealing either party\u2019s data or model. Nevertheless, existing cryptographic neural network inference services suffer from huge running latency; in particular, the latency of communication-expensive cryptographic activation function is 3 orders of magnitude higher than plaintext-domain activation function. And activations are the necessary components of the modern neural networks. Therefore, slow cryptographic activation has become the primary obstacle of efficient cryptographic inference. 
\n\nIn this paper, we propose a new technique, called SAFENet, to enable a Secure, Accurate and Fast nEural Network inference service. To speedup secure inference and guarantee inference accuracy, SAFENet includes channel-wise activation approximation with multiple-degree options. This is implemented by keeping the most useful activation channels and replacing the remaining, less useful, channels with various-degree polynomials. SAFENet also supports mixed-precision activation approximation by automatically assigning different replacement ratios to various layer; further increasing the approximation ratio and reducing inference latency. Our experimental results show SAFENet obtains the state-of-the-art inference latency and performance, reducing latency by 38%\u223c61% or improving accuracy by 1.8%\u223c4% over prior techniques on various encrypted datasets.", + "title": "SAFENet: A Secure, Accurate and Fast Neural Network Inference", + "authors": [ + "Qian Lou", + "Yilin Shen", + "Hongxia Jin", + "Lei Jiang" + ], + "emails": [ + "~Lei_Jiang1", + "~Qian_Lou1", + "~Yilin_Shen1", + "~Hongxia_Jin1" + ], + "rank": 594 + }, + { + "url": "https://openreview.net/forum?id=-bdp_8Itjwp", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 6 + ], + "rating": "6.31", + "confidences": [ + 4, + 5, + 4, + 3 + ], + "abstract": "As a subset of unsupervised representation learning, self-supervised representation learning adopts self-defined signals as supervision and uses the learned representation for downstream tasks, such as object detection and image captioning. Many proposed approaches for self-supervised learning follow naturally a multi-view perspective, where the input (e.g., original images) and the self-supervised signals (e.g., augmented images) can be seen as two redundant views of the data. 
Building from this multi-view perspective, this paper provides an information-theoretical framework to better understand the properties that encourage successful self-supervised learning. Specifically, we demonstrate that self-supervised learned representations can extract task-relevant information and discard task-irrelevant information. Our theoretical framework paves the way to a larger space of self-supervised learning objective design. In particular, we propose a composite objective that bridges the gap between prior contrastive and predictive learning objectives, and introduce an additional objective term to discard task-irrelevant information. To verify our analysis, we conduct controlled experiments to evaluate the impact of the composite objectives. We also explore our framework's empirical generalization beyond the multi-view perspective, where the cross-view redundancy may not be clearly observed.", + "title": "Self-supervised Learning from a Multi-view Perspective", + "authors": [ + "Yao-Hung Hubert Tsai", + "Yue Wu", + "Ruslan Salakhutdinov", + "Louis-Philippe Morency" + ], + "emails": [ + "~Louis-Philippe_Morency1", + "~Ruslan_Salakhutdinov1", + "~Yao-Hung_Hubert_Tsai1", + "~Yue_Wu17" + ], + "rank": 595 + }, + { + "url": "https://openreview.net/forum?id=C70cp4Cn32", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 7 + ], + "rating": "6.31", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "We propose Multi-Level Local SGD, a distributed stochastic gradient method for learning a smooth, non-convex objective in a multi-level communication network with heterogeneous workers. Our network model consists of a set of disjoint sub-networks, with a single hub and multiple workers; further, workers may have different operating rates. The hubs exchange information with one another via a connected, but not necessarily complete communication network. 
In our algorithm, sub-networks execute a distributed SGD algorithm, using a hub-and-spoke paradigm, and the hubs periodically average their models with neighboring hubs. We first provide a unified mathematical framework that describes the Multi-Level Local SGD algorithm. We then present a theoretical analysis of the algorithm; our analysis shows the dependence of the convergence error on the worker node heterogeneity, hub network topology, and the number of local, sub-network, and global iterations. We illustrate the effectiveness of our algorithm in a multi-level network with slow workers via simulation-based experiments.", + "title": "Multi-Level Local SGD: Distributed SGD for Heterogeneous Hierarchical Networks", + "authors": [ + "Timothy Castiglia", + "Anirban Das", + "Stacy Patterson" + ], + "emails": [ + "rpi.edu", + "~Timothy_Castiglia1", + "~Stacy_Patterson1" + ], + "rank": 596 + }, + { + "url": "https://openreview.net/forum?id=6UdQLhqJyFD", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 8, + 5 + ], + "rating": "6.31", + "confidences": [ + 3, + 3, + 5, + 5 + ], + "abstract": "The recent success of Transformers in the language domain has motivated adapting it to a multimodal setting, where a new visual model is trained in tandem with an already pretrained language model. However, due to the excessive memory requirements from Transformers, existing work typically fixes the language model and train only the vision module, which limits its ability to learn cross-modal information in an end-to-end manner. In this work, we focus on reducing the parameters of multimodal Transformers in the context of audio-visual video representation learning. 
We alleviate the high memory requirement by sharing the parameters of Transformers across layers and modalities; we decompose the Transformer into modality-specific and modality-shared parts so that the model learns the dynamics of each modality both individually and together, and propose a novel parameter sharing scheme based on low-rank approximation. We show that our approach reduces parameters of the Transformers up to 97%, allowing us to train our model end-to-end from scratch. We also propose a negative sampling approach based on an instance similarity measured on the CNN embedding space that our model learns together with the Transformers. To demonstrate our approach, we pretrain our model on 30-second clips (480 frames) from Kinetics-700 and transfer it to audio-visual classification tasks.", + "title": "Parameter Efficient Multimodal Transformers for Video Representation Learning", + "authors": [ + "Sangho Lee", + "Youngjae Yu", + "Gunhee Kim", + "Thomas Breuel", + "Jan Kautz", + "Yale Song" + ], + "emails": [ + "~Sangho_Lee1", + "~Thomas_Breuel1", + "~Youngjae_Yu1", + "~Yale_Song1", + "~Jan_Kautz1", + "~Gunhee_Kim1" + ], + "rank": 597 + }, + { + "url": "https://openreview.net/forum?id=Xb8xvrtB8Ce", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 5 + ], + "rating": "6.31", + "confidences": [ + 5, + 4, + 4, + 3 + ], + "abstract": "Adversarial training (AT) is one of the most effective strategies for promoting model robustness. However, recent benchmarks show that most of the proposed improvements on AT are less effective than simply early stopping the training procedure. This counter-intuitive fact motivates us to investigate the implementation details of tens of AT methods. Surprisingly, we find that the basic settings (e.g., weight decay, training schedule, etc.) used in these methods are highly inconsistent. 
In this work, we provide comprehensive evaluations on CIFAR-10, focusing on the effects of mostly overlooked training tricks and hyperparameters for adversarially trained models. Our empirical observations suggest that adversarial robustness is much more sensitive to some basic training settings than we thought. For example, a slightly different value of weight decay can reduce the model robust accuracy by more than 7%, which is probable to override the potential promotion induced by the proposed methods. We conclude a baseline training setting and re-implement previous defenses to achieve new state-of-the-art results. These facts also appeal to more concerns on the overlooked confounders when benchmarking defenses.", + "title": "Bag of Tricks for Adversarial Training", + "authors": [ + "Tianyu Pang", + "Xiao Yang", + "Yinpeng Dong", + "Hang Su", + "Jun Zhu" + ], + "emails": [ + "~Yinpeng_Dong2", + "~Jun_Zhu2", + "~Tianyu_Pang1", + "~Xiao_Yang4", + "~Hang_Su3" + ], + "rank": 598 + }, + { + "url": "https://openreview.net/forum?id=r7L91opmsr", + "decision": "Reject", + "ratings": [ + 7, + 6, + 7, + 5 + ], + "rating": "6.31", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "Effective variational inference crucially depends on a flexible variational family of distributions. Recent work has explored sequential Monte-Carlo (SMC) methods to construct variational distributions, which can, in principle, approximate the target posterior arbitrarily well, which is especially appealing for models with inherent sequential structure. However, SMC, which represents the posterior using a weighted set of particles, often suffers from particle weight degeneracy, leading to a large variance of the resulting estimators. To address this issue, we present a novel approach that leverages the idea of \\emph{partial} rejection control (PRC) for developing a robust variational inference (VI) framework. 
In addition to developing a superior VI bound, we propose a novel marginal likelihood estimator constructed via a dice-enterprise: a generalization of the Bernoulli factory to construct unbiased estimators for SMC-PRC. The resulting variational lower bound can be optimized efficiently with respect to the variational parameters and generalizes several existing approaches in the VI literature into a single framework. We show theoretical properties of the lower bound and report experiments on various sequential models, such as the Gaussian state-space model and variational RNN, on which our approach outperforms existing methods.", + "title": "Partial Rejection Control for Robust Variational Inference in Sequential Latent Variable Models", + "authors": [ + "Rahul Sharma", + "Soumya Banerjee", + "Dootika Vats", + "Piyush Rai" + ], + "emails": [ + "~Dootika_Vats1", + "~Rahul_Sharma1", + "~Soumya_Banerjee1", + "~Piyush_Rai1" + ], + "rank": 599 + }, + { + "url": "https://openreview.net/forum?id=9y4qOAIfA9r", + "decision": "Reject", + "ratings": [ + 5, + 7, + 7, + 6 + ], + "rating": "6.31", + "confidences": [ + 3, + 3, + 4, + 3 + ], + "abstract": "Neuroscientists evaluate deep neural networks for natural language processing as possible candidate models for how language is processed in the brain. These models are often trained without explicit linguistic supervision, but have been shown to learn some linguistic structure in the absence of such supervision (Manning et. al, 2020), potentially questioning the relevance of symbolic linguistic theories in modeling such cognitive processes (Warstadt & Bowman, 2020). We evaluate across two fMRI datasets whether language models align better with brain recordings, if their attention is biased by annotations from syntactic or semantic formalisms. Using structure from dependency or minimal recursion semantic annotations, we find alignments improve significantly for one of the datasets. For another dataset, we see more mixed results. 
We present an extensive analysis of these results. Our proposed approach enables the evaluation of more targeted hypotheses about the composition of meaning in the brain, expanding the range of possible scientific inferences a neuroscientist could make, and opens up new opportunities for cross-pollination between computational neuroscience and linguistics.\n\n", + "title": "Does injecting linguistic structure into language models lead to better alignment with brain recordings?", + "authors": [ + "Mostafa Abdou", + "Ana Valeria Gonz\u00e1lez", + "Mariya K Toneva", + "Daniel Hershcovich", + "Anders S\u00f8gaard" + ], + "emails": [ + "~Daniel_Hershcovich1", + "~Mostafa_Abdou2", + "~Anders_S\u00f8gaard1", + "~Mariya_K_Toneva1", + "di.ku.dk" + ], + "rank": 600 + }, + { + "url": "https://openreview.net/forum?id=XjYgR6gbCEc", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 6 + ], + "rating": "6.31", + "confidences": [ + 5, + 4, + 4, + 3 + ], + "abstract": "Data augmentation is an efficient way to expand a training dataset by creating additional artificial data. While data augmentation is found to be effective in improving the generalization capabilities of models for various machine learning tasks, the underlying augmentation methods are usually manually designed and carefully evaluated for each data modality separately, like image processing functions for image data and word-replacing rules for text data. In this work, we propose an automated data augmentation approach called MODALS (Modality-agnostic Automated Data Augmentation in the Latent Space) to augment data for any modality in a generic way. MODALS exploits automated data augmentation to fine-tune four universal data transformation operations in the latent space to adapt the transform to data of different modalities. 
Through comprehensive experiments, we demonstrate the effectiveness of MODALS on multiple datasets for text, tabular, time-series and image modalities.", + "title": "MODALS: Modality-agnostic Automated Data Augmentation in the Latent Space", + "authors": [ + "Tsz-Him Cheung", + "Dit-Yan Yeung" + ], + "emails": [ + "~Tsz-Him_Cheung1", + "~Dit-Yan_Yeung2" + ], + "rank": 601 + }, + { + "url": "https://openreview.net/forum?id=hPWj1qduVw8", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7 + ], + "rating": "6.31", + "confidences": [ + 4, + 5, + 4 + ], + "abstract": "Compared to traditional visual question answering, video-grounded dialogues require additional reasoning over dialogue context to answer questions in a multi-turn setting. Previous approaches to video-grounded dialogues mostly use dialogue context as a simple text input without modelling the inherent information flows at the turn level. In this paper, we propose a novel framework of Reasoning Paths in Dialogue Context (PDC). PDC model discovers information flows among dialogue turns through a semantic graph constructed based on lexical components in each question and answer. PDC model then learns to predict reasoning paths over this semantic graph. Our path prediction model predicts a path from the current turn through past dialogue turns that contain additional visual cues to answer the current question. Our reasoning model sequentially processes both visual and textual information through this reasoning path and the propagated features are used to generate the answer. Our experimental results demonstrate the effectiveness of our method and provide additional insights on how models use semantic dependencies in a dialogue context to retrieve visual cues.", + "title": "Learning Reasoning Paths over Semantic Graphs for Video-grounded Dialogues", + "authors": [ + "Hung Le", + "Nancy F. 
Chen", + "Steven Hoi" + ], + "emails": [ + "~Steven_Hoi2", + "~Nancy_F._Chen1", + "~Hung_Le2" + ], + "rank": 602 + }, + { + "url": "https://openreview.net/forum?id=aGfU_xziEX8", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 6 + ], + "rating": "6.31", + "confidences": [ + 3, + 3, + 4, + 3 + ], + "abstract": "Hawkes process provides an effective statistical framework for analyzing the time-dependent interaction of neuronal spiking activities. Although utilized in many real applications, the classic Hawkes process is incapable of modelling inhibitory interactions among neurons. Instead, the nonlinear Hawkes process allows for a more flexible influence pattern with excitatory or inhibitory interactions. In this paper, three sets of auxiliary latent variables (Polya-Gamma variables, latent marked Poisson processes and sparsity variables) are augmented to make functional connection weights in a Gaussian form, which allows for a simple iterative algorithm with analytical updates. As a result, an efficient expectation-maximization (EM) algorithm is derived to obtain the maximum a posteriori (MAP) estimate. We demonstrate the accuracy and efficiency performance of our algorithm on synthetic and real data. For real neural recordings, we show our algorithm can estimate the temporal dynamics of interaction and reveal the interpretable functional connectivity underlying neural spike trains. 
", + "title": "Efficient Inference of Flexible Interaction in Spiking-neuron Networks", + "authors": [ + "Feng Zhou", + "Yixuan Zhang", + "Jun Zhu" + ], + "emails": [ + "~Feng_Zhou9", + "uts.edu.au", + "~Jun_Zhu2" + ], + "rank": 603 + }, + { + "url": "https://openreview.net/forum?id=F-mvpFpn_0q", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 7, + 7, + 4 + ], + "rating": "6.31", + "confidences": [ + 4, + 4, + 3, + 5 + ], + "abstract": "We propose the challenge of rapid task-solving in novel environments (RTS), wherein an agent must solve a series of tasks as rapidly as possible in an unfamiliar environment. An effective RTS agent must balance between exploring the unfamiliar environment and solving its current task, all while building a model of the new environment over which it can plan when faced with later tasks. While modern deep RL agents exhibit some of these abilities in isolation, none are suitable for the full RTS challenge. To enable progress toward RTS, we introduce two challenge domains: (1) a minimal RTS challenge called the Memory&Planning Game and (2) One-Shot StreetLearn Navigation, which introduces scale and complexity from real-world data. We demonstrate that state-of-the-art deep RL agents fail at RTS in both domains, and that this failure is due to an inability to plan over gathered knowledge. We develop Episodic Planning Networks (EPNs) and show that deep-RL agents with EPNs excel at RTS, outperforming the nearest baseline by factors of 2-3 and learning to navigate held-out StreetLearn maps within a single episode. 
We show that EPNs learn to execute a value iteration-like planning algorithm and that they generalize to situations beyond their training experience.", + "title": "Rapid Task-Solving in Novel Environments", + "authors": [ + "Samuel Ritter", + "Ryan Faulkner", + "Laurent Sartran", + "Adam Santoro", + "Matthew Botvinick", + "David Raposo" + ], + "emails": [ + "~Samuel_Ritter1", + "~Matthew_Botvinick1", + "~Laurent_Sartran1", + "~Ryan_Faulkner2", + "~Adam_Santoro1", + "~David_Raposo1" + ], + "rank": 604 + }, + { + "url": "https://openreview.net/forum?id=uFkGzn9RId8", + "decision": "Reject", + "ratings": [ + 5, + 6, + 7, + 7 + ], + "rating": "6.31", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Partial observability remains a major challenge for reinforcement learning (RL). In fully observable environments it is sufficient for RL agents to learn memoryless policies. However, some form of memory is necessary when RL agents are faced with partial observability. In this paper we study a lightweight approach: we augment the environment with an external memory and additional actions to control what, if anything, is written to the memory. At every step, the current memory state is part of the agent\u2019s observation, and the agent selects a tuple of actions: one action that modifies the environment and another that modifies the memory. When the external memory is sufficiently expressive, optimal memoryless policies yield globally optimal solutions. We develop the theory for memory-augmented environments and formalize the RL problem. Previous attempts to use external memory in the form of binary memory have produced poor results in practice. We propose and experimentally evaluate alternative forms of k-size buffer memory where the agent can decide to remember observations by pushing (or not) them into the buffer. 
Our memories are simple to implement and outperform binary and LSTM-based memories in well-established partially observable domains.", + "title": "The act of remembering: A study in partially observable reinforcement learning", + "authors": [ + "Rodrigo Toro Icarte", + "Richard Valenzano", + "Toryn Q. Klassen", + "Phillip Christoffersen", + "Amir-massoud Farahmand", + "Sheila A. McIlraith" + ], + "emails": [ + "~Toryn_Q._Klassen1", + "~Phillip_Christoffersen1", + "~Sheila_A._McIlraith1", + "~Amir-massoud_Farahmand1", + "~Richard_Valenzano1", + "~Rodrigo_Toro_Icarte1" + ], + "rank": 605 + }, + { + "url": "https://openreview.net/forum?id=7I12hXRi8F", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 8, + 6 + ], + "rating": "6.31", + "confidences": [ + 4, + 3, + 4, + 2 + ], + "abstract": "In the era of causal revolution, identifying the causal effect of an exposure on the outcome of interest is an important problem in many areas, such as epidemics, medicine, genetics, and economics. Under a general causal graph, the exposure may have a direct effect on the outcome and also an indirect effect regulated by a set of mediators. An analysis of causal effects that interprets the causal mechanism contributed through mediators is hence challenging but on demand. To the best of our knowledge, there are no feasible algorithms that give an exact decomposition of the indirect effect on the level of individual mediators, due to common interaction among mediators in the complex graph. In this paper, we establish a new statistical framework to comprehensively characterize causal effects with multiple mediators, namely, ANalysis Of Causal Effects (ANOCE), with a newly introduced definition of the mediator effect, under the linear structure equation model. We further propose a constrained causal structure learning method by incorporating a novel identification constraint that specifies the temporal causal relationship of variables. 
The proposed algorithm is applied to investigate the causal effects of 2020 Hubei lockdowns on reducing the spread of the coronavirus in Chinese major cities out of Hubei. ", + "title": "ANOCE: Analysis of Causal Effects with Multiple Mediators via Constrained Structural Learning", + "authors": [ + "Hengrui Cai", + "Rui Song", + "Wenbin Lu" + ], + "emails": [ + "~Rui_Song2", + "ncsu.edu", + "~Hengrui_Cai1" + ], + "rank": 606 + }, + { + "url": "https://openreview.net/forum?id=uIc4W6MtbDA", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 7 + ], + "rating": "6.30", + "confidences": [ + 3, + 1, + 3, + 3 + ], + "abstract": "Policies for real-world multi-agent problems, such as optimal taxation, can be learned in multi-agent simulations with AI agents that emulate humans. However, simulations can suffer from reality gaps as humans often act suboptimally or optimize for different objectives (i.e., bounded rationality). We introduce \u03f5-Robust Multi-Agent Simulation (ERMAS), a robust optimization framework to learn AI policies that are robust to such multi-agent reality gaps. The objective of ERMAS theoretically guarantees robustness to the \u03f5-Nash equilibria of other agents \u2013 that is, robustness to behavioral deviations with a regret of at most \u03f5. ERMAS efficiently solves a first-order approximation of the robustness objective using meta-learning methods. We show that ERMAS yields robust policies for repeated bimatrix games and optimal adaptive taxation in economic simulations, even when baseline notions of robustness are uninformative or intractable. 
In particular, we show ERMAS can learn tax policies that are robust to changes in agent risk aversion, improving policy objectives (social welfare) by up to 15% in complex spatiotemporal simulations using the AI Economist (Zheng et al., 2020).", + "title": "ERMAS: Learning Policies Robust to Reality Gaps in Multi-Agent Simulations", + "authors": [ + "Eric Zhao", + "Alexander R Trott", + "Caiming Xiong", + "Stephan Zheng" + ], + "emails": [ + "~Alexander_R_Trott1", + "~Caiming_Xiong1", + "~Eric_Zhao1", + "~Stephan_Zheng1" + ], + "rank": 607 + }, + { + "url": "https://openreview.net/forum?id=V69LGwJ0lIN", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6 + ], + "rating": "6.30", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "Reinforcement learning (RL) has achieved impressive performance in a variety of online settings in which an agent\u2019s ability to query the environment for transitions and rewards is effectively unlimited. However, in many practical applications, the situation is reversed: an agent may have access to large amounts of undirected offline experience data, while access to the online environment is severely limited. In this work, we focus on this offline setting. Our main insight is that, when presented with offline data composed of a variety of behaviors, an effective way to leverage this data is to extract a continuous space of recurring and temporally extended primitive behaviors before using these primitives for downstream task learning. Primitives extracted in this way serve two purposes: they delineate the behaviors that are supported by the data from those that are not, making them useful for avoiding distributional shift in offline RL; and they provide a degree of temporal abstraction, which reduces the effective horizon yielding better learning in theory, and improved offline RL in practice. 
In addition to benefiting offline policy optimization, we show that performing offline primitive learning in this way can also be leveraged for improving few-shot imitation learning as well as exploration and transfer in online RL on a variety of benchmark domains. Visualizations and code are available at https://sites.google.com/view/opal-iclr", + "title": "OPAL: Offline Primitive Discovery for Accelerating Offline Reinforcement Learning", + "authors": [ + "Anurag Ajay", + "Aviral Kumar", + "Pulkit Agrawal", + "Sergey Levine", + "Ofir Nachum" + ], + "emails": [ + "~Ofir_Nachum1", + "~Aviral_Kumar2", + "~Anurag_Ajay1", + "~Pulkit_Agrawal1", + "~Sergey_Levine1" + ], + "rank": 608 + }, + { + "url": "https://openreview.net/forum?id=D51irFX8UOG", + "decision": "Reject", + "ratings": [ + 7, + 6, + 5, + 7 + ], + "rating": "6.30", + "confidences": [ + 2, + 3, + 2, + 3 + ], + "abstract": "Humans learn compositional and causal abstraction, \\ie, knowledge, in response to the structure of naturalistic tasks. When presented with a problem-solving task involving some objects, toddlers would first interact with these objects to reckon what they are and what can be done with them. Leveraging these concepts, they could understand the internal structure of this task, without seeing all of the problem instances. Remarkably, they further build cognitively executable strategies to \\emph{rapidly} solve novel problems. To empower a learning agent with similar capability, we argue there shall be three levels of generalization in how an agent represents its knowledge: perceptual, conceptual, and algorithmic. In this paper, we devise the very first systematic benchmark that offers joint evaluation covering all three levels. This benchmark is centered around a novel task domain, HALMA, for visual concept development and rapid problem solving. 
Uniquely, HALMA has a minimum yet complete concept space, upon which we introduce a novel paradigm to rigorously diagnose and dissect learning agents' capability in understanding and generalizing complex and structural concepts. We conduct extensive experiments on reinforcement learning agents with various inductive biases and carefully report their proficiency and weakness.", + "title": "HALMA: Humanlike Abstraction Learning Meets Affordance in Rapid Problem Solving", + "authors": [ + "Sirui Xie", + "Xiaojian Ma", + "Peiyu Yu", + "Yixin Zhu", + "Ying Nian Wu", + "Song-Chun Zhu" + ], + "emails": [ + "~Song-Chun_Zhu1", + "~Yixin_Zhu1", + "~Peiyu_Yu1", + "~Xiaojian_Ma1", + "~Sirui_Xie1", + "~Ying_Nian_Wu1" + ], + "rank": 609 + }, + { + "url": "https://openreview.net/forum?id=AM0PBmqmojH", + "decision": "Reject", + "ratings": [ + 6, + 6, + 7, + 6 + ], + "rating": "6.29", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "Optimal transport (OT) is a cornerstone of many machine learning tasks. The current best practice for computing OT is via entropy regularization and Sinkhorn iterations. This algorithm runs in quadratic time and requires calculating the full pairwise cost matrix, which is prohibitively expensive for large sets of objects. To alleviate this limitation we propose to instead use a sparse approximation of the cost matrix based on locality sensitive hashing (LSH). Moreover, we fuse this sparse approximation with the Nystr\u00f6m method, resulting in the locally corrected Nystr\u00f6m method (LCN). These approximations enable general log-linear time algorithms for entropy-regularized OT that perform well even in complex, high-dimensional spaces. We thoroughly demonstrate these advantages via a theoretical analysis and by evaluating multiple approximations both directly and as a component of two real-world models. 
Using approximate Sinkhorn for unsupervised word embedding alignment enables us to train the model full-batch in a fraction of the time while improving upon the original on average by 3.1 percentage points without any model changes. For graph distance regression we propose the graph transport network (GTN), which combines graph neural networks (GNNs) with enhanced Sinkhorn and outcompetes previous models by 48%. LCN-Sinkhorn enables GTN to achieve this while still scaling log-linearly in the number of nodes.", + "title": "Warpspeed Computation of Optimal Transport, Graph Distances, and Embedding Alignment", + "authors": [ + "Johannes Klicpera", + "Marten Lienen", + "Stephan G\u00fcnnemann" + ], + "emails": [ + "~Johannes_Klicpera1", + "in.tum.de", + "~Stephan_G\u00fcnnemann1" + ], + "rank": 610 + }, + { + "url": "https://openreview.net/forum?id=Drynvt7gg4L", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 8, + 6, + 7 + ], + "rating": "6.29", + "confidences": [ + 5, + 5, + 2, + 5 + ], + "abstract": "Custom voice, a specific text to speech (TTS) service in commercial speech platforms, aims to adapt a source TTS model to synthesize personal voice for a target speaker using few speech from her/him. Custom voice presents two unique challenges for TTS adaptation: 1) to support diverse customers, the adaptation model needs to handle diverse acoustic conditions which could be very different from source speech data, and 2) to support a large number of customers, the adaptation parameters need to be small enough for each target speaker to reduce memory usage while maintaining high voice quality. In this work, we propose AdaSpeech, an adaptive TTS system for high-quality and efficient customization of new voices. We design several techniques in AdaSpeech to address the two challenges in custom voice: 1) To handle different acoustic conditions, we model the acoustic information in both utterance and phoneme level. 
Specifically, we use one acoustic encoder to extract an utterance-level vector and another one to extract a sequence of phoneme-level vectors from the target speech during pre-training and fine-tuning; in inference, we extract the utterance-level vector from a reference speech and use an acoustic predictor to predict the phoneme-level vectors. 2) To better trade off the adaptation parameters and voice quality, we introduce conditional layer normalization in the mel-spectrogram decoder of AdaSpeech, and fine-tune this part in addition to speaker embedding for adaptation. We pre-train the source TTS model on LibriTTS datasets and fine-tune it on VCTK and LJSpeech datasets (with different acoustic conditions from LibriTTS) with few adaptation data, e.g., 20 sentences, about 1 minute speech. Experiment results show that AdaSpeech achieves much better adaptation quality than baseline methods, with only about 5K specific parameters for each speaker, which demonstrates its effectiveness for custom voice. The audio samples are available at https://speechresearch.github.io/adaspeech/.", + "title": "AdaSpeech: Adaptive Text to Speech for Custom Voice", + "authors": [ + "Mingjian Chen", + "Xu Tan", + "Bohan Li", + "Yanqing Liu", + "Tao Qin", + "sheng zhao", + "Tie-Yan Liu" + ], + "emails": [ + "~Tao_Qin1", + "~Xu_Tan1", + "microsoft.com", + "~Tie-Yan_Liu1", + "~sheng_zhao1" + ], + "rank": 611 + }, + { + "url": "https://openreview.net/forum?id=4TSiOTkKe5P", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 6 + ], + "rating": "6.29", + "confidences": [ + 2, + 4, + 4, + 4 + ], + "abstract": "Discovering causal structures of temporal processes is a major tool of scientific inquiry because it helps us better understand and explain the mechanisms driving a phenomenon of interest, thereby facilitating analysis, reasoning, and synthesis for such systems. 
\nHowever, accurately inferring causal structures within a phenomenon based on observational data only is still an open problem. Indeed, this type of data usually consists in short time series with missing or noisy values for which causal inference is increasingly difficult. In this work, we propose a method to uncover causal relations in chaotic dynamical systems from short, noisy and sporadic time series (that is, incomplete observations at infrequent and irregular intervals) where the classical convergent cross mapping (CCM) fails. Our method works by learning a Neural ODE latent process modeling the state-space dynamics of the time series and by checking the existence of a continuous map between the resulting processes. We provide theoretical analysis and show empirically that Latent-CCM can reliably uncover the true causal pattern, unlike traditional methods.", + "title": "Latent Convergent Cross Mapping", + "authors": [ + "Edward De Brouwer", + "Adam Arany", + "Jaak Simm", + "Yves Moreau" + ], + "emails": [ + "~Adam_Arany1", + "~Edward_De_Brouwer1", + "~Yves_Moreau2", + "~Jaak_Simm1" + ], + "rank": 612 + }, + { + "url": "https://openreview.net/forum?id=R0a0kFI3dJx", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 7, + 7 + ], + "rating": "6.29", + "confidences": [ + 4, + 2, + 4, + 4 + ], + "abstract": "We present a new family of min-max optimization algorithms that automatically exploit the geometry of the gradient data observed at earlier iterations to perform more informative extra-gradient steps in later ones.\nThanks to this adaptation mechanism, the proposed method automatically detects whether the problem is smooth or not, without requiring any prior tuning by the optimizer.\nAs a result, the algorithm simultaneously achieves order-optimal convergence rates, \\ie it converges to an \u03b5-optimal solution within O(1/\u03b5) iterations in smooth problems, and within O(1/\u03b52) iterations in non-smooth ones. 
Importantly, these guarantees do not require any of the standard boundedness or Lipschitz continuity conditions that are typically assumed in the literature; in particular, they apply even to problems with singularities (such as resource allocation problems and the like). This adaptation is achieved through the use of a geometric apparatus based on Finsler metrics and a suitably chosen mirror-prox template that allows us to derive sharp convergence rates for the methods at hand.", + "title": "Adaptive Extra-Gradient Methods for Min-Max Optimization and Games", + "authors": [ + "Kimon Antonakopoulos", + "Veronica Belmega", + "Panayotis Mertikopoulos" + ], + "emails": [ + "~Panayotis_Mertikopoulos1", + "~Veronica_Belmega1", + "~Kimon_Antonakopoulos1" + ], + "rank": 613 + }, + { + "url": "https://openreview.net/forum?id=Nj8EIrSu5O", + "decision": "Reject", + "ratings": [ + 5, + 7, + 5, + 8 + ], + "rating": "6.29", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "Standard planners for sequential decision making (including Monte Carlo planning, tree search, dynamic programming, etc.) are constrained by an implicit sequential planning assumption: The order in which a plan is constructed is the same in which it is executed.\nWe consider alternatives to this assumption for the class of goal-directed Reinforcement Learning (RL) problems.\nInstead of an environment transition model, we assume an imperfect, goal-directed policy.\nThis low-level policy can be improved by a plan, consisting of an appropriate sequence of sub-goals that guide it from the start to the goal state. We propose a planning algorithm, Divide-and-Conquer Monte Carlo Tree Search (DC-MCTS), for approximating the optimal plan by means of proposing intermediate sub-goals which hierarchically partition the initial tasks into simpler ones that are then solved independently and recursively. 
The algorithm critically makes use of a learned sub-goal proposal for finding appropriate partitions trees of new tasks based on prior experience.\nDifferent strategies for learning sub-goal proposals give rise to different planning strategies that strictly generalize sequential planning.\nWe show that this algorithmic flexibility over planning order leads to improved results in navigation tasks in grid-worlds as well as in challenging continuous control environments.", + "title": "Divide-and-Conquer Monte Carlo Tree Search", + "authors": [ + "Giambattista Parascandolo", + "Lars Holger Buesing", + "Josh Merel", + "Leonard Hasenclever", + "John Aslanides", + "Jessica B Hamrick", + "Nicolas Heess", + "Alexander Neitz", + "Theophane Weber" + ], + "emails": [ + "~John_Aslanides1", + "~Lars_Holger_Buesing1", + "~Leonard_Hasenclever1", + "~Giambattista_Parascandolo1", + "~Nicolas_Heess1", + "~Josh_Merel1", + "~Theophane_Weber1", + "~Alexander_Neitz1", + "~Jessica_B_Hamrick1" + ], + "rank": 614 + }, + { + "url": "https://openreview.net/forum?id=bM3L3I_853", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 5, + 6 + ], + "rating": "6.29", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Temporal modelling is the key for efficient video action recognition. While understanding temporal information can improve recognition accuracy for dynamic actions, removing temporal redundancy and reusing past features can significantly save computation leading to efficient action recognition. In this paper, we introduce an adaptive temporal fusion network, called AdaFuse, that dynamically fuses channels from current and past feature maps for strong temporal modelling. Specifically, the necessary information from the historical convolution feature maps is fused with current pruned feature maps with the goal of improving both recognition accuracy and efficiency. In addition, we use a skipping operation to further reduce the computation cost of action recognition. 
Extensive experiments on SomethingV1 & V2, Jester and Mini-Kinetics show that our approach can achieve about 40% computation savings with comparable accuracy to state-of-the-art methods. The project page can be found at https://mengyuest.github.io/AdaFuse/", + "title": "AdaFuse: Adaptive Temporal Fusion Network for Efficient Action Recognition", + "authors": [ + "Yue Meng", + "Rameswar Panda", + "Chung-Ching Lin", + "Prasanna Sattigeri", + "Leonid Karlinsky", + "Kate Saenko", + "Aude Oliva", + "Rogerio Feris" + ], + "emails": [ + "~Leonid_Karlinsky3", + "~Rameswar_Panda1", + "~Chung-Ching_Lin2", + "~Yue_Meng1", + "~Kate_Saenko1", + "~Rogerio_Feris1", + "~Aude_Oliva1", + "~Prasanna_Sattigeri1" + ], + "rank": 615 + }, + { + "url": "https://openreview.net/forum?id=u2YNJPcQlwq", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7, + 5 + ], + "rating": "6.29", + "confidences": [ + 3, + 2, + 5, + 4 + ], + "abstract": "Intrinsically motivated artificial agents learn advantageous behavior without externally-provided rewards. Previously, it was shown that maximizing mutual information between agent actuators and future states, known as the empowerment principle, enables unsupervised stabilization of dynamical systems at upright positions, which is a prototypical intrinsically motivated behavior for upright standing and walking. This follows from the coincidence between the objective of stabilization and the objective of empowerment. Unfortunately, sample-based estimation of this kind of mutual information is challenging. Recently, various variational lower bounds (VLBs) on empowerment have been proposed as solutions; however, they are often biased, unstable in training, and have high sample complexity. In this work, we propose an alternative solution based on a trainable representation of a dynamical system as a Gaussian channel, which allows us to efficiently calculate an unbiased estimator of empowerment by convex optimization. 
We demonstrate our solution for sample-based unsupervised stabilization on different dynamical control systems and show the advantages of our method by comparing it to the existing VLB approaches. Specifically, we show that our method has a lower sample complexity, is more stable in training, possesses the essential properties of the empowerment function, and allows estimation of empowerment from images. Consequently, our method opens a path to wider and easier adoption of empowerment for various applications.", + "title": "Efficient Empowerment Estimation for Unsupervised Stabilization", + "authors": [ + "Ruihan Zhao", + "Kevin Lu", + "Pieter Abbeel", + "Stas Tiomkin" + ], + "emails": [ + "~Stas_Tiomkin1", + "~Kevin_Lu2", + "~Pieter_Abbeel2", + "~Ruihan_Zhao1" + ], + "rank": 616 + }, + { + "url": "https://openreview.net/forum?id=SlrqM9_lyju", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 7, + 7 + ], + "rating": "6.29", + "confidences": [ + 4, + 2, + 4, + 4 + ], + "abstract": "The learning rate (LR) schedule is one of the most important hyper-parameters needing careful tuning in training DNNs. However, it is also one of the least automated parts of machine learning systems and usually costs significant manual effort and computing. Though there are pre-defined LR schedules and optimizers with adaptive LR, they introduce new hyperparameters that need to be tuned separately for different tasks/datasets. In this paper, we consider the question: Can we automatically tune the LR over the course of training without human involvement? We propose an efficient method, AutoLRS, which automatically optimizes the LR for each training stage by modeling training dynamics. AutoLRS aims to find an LR that minimizes the validation loss, every \u03c4 steps. We formulate it as black-box optimization and solve it by Bayesian optimization (BO). 
However, collecting training instances for BO requires a system to evaluate each LR queried by BO's acquisition function for \u03c4 steps, which is prohibitively expensive in practice. Instead, we apply each candidate LR for only \u03c4\u2032\u226a\u03c4 steps and train an exponential model to predict the validation loss after \u03c4 steps. This mutual-training process between BO and the exponential model allows us to bound the number of training steps invested in the BO search. We demonstrate the advantages and the generality of AutoLRS through extensive experiments of training DNNs from diverse domains and using different optimizers. The LR schedules auto-generated by AutoLRS leads to a speedup of 1.22\u00d7, 1.43\u00d7, and 1.5\u00d7 when training ResNet-50, Transformer, and BERT, respectively, compared to the LR schedules in their original papers, and an average speedup of 1.31\u00d7 over state-of-the-art highly tuned LR schedules.", + "title": "AutoLRS: Automatic Learning-Rate Schedule by Bayesian Optimization on the Fly", + "authors": [ + "Yuchen Jin", + "Tianyi Zhou", + "Liangyu Zhao", + "Yibo Zhu", + "Chuanxiong Guo", + "Marco Canini", + "Arvind Krishnamurthy" + ], + "emails": [ + "~Arvind_Krishnamurthy1", + "~Marco_Canini1", + "cs.washington.edu", + "bytedance.com", + "~Tianyi_Zhou1", + "~Yuchen_Jin1" + ], + "rank": 617 + }, + { + "url": "https://openreview.net/forum?id=NjF772F4ZZR", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 6 + ], + "rating": "6.29", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "Multi-objective optimization (MOO) problems are prevalent in machine learning. These problems have a set of optimal solutions, called the Pareto front, where each point on the front represents a different trade-off between possibly conflicting objectives. 
Recent MOO methods can target a specific desired ray in loss space however, most approaches still face two grave limitations: (i) A separate model has to be trained for each point on the front; and (ii) The exact trade-off must be known before the optimization process. Here, we tackle the problem of learning the entire Pareto front, with the capability of selecting a desired operating point on the front after training. We call this new setup Pareto-Front Learning (PFL).\n\nWe describe an approach to PFL implemented using HyperNetworks, which we term Pareto HyperNetworks (PHNs). PHN learns the entire Pareto front simultaneously using a single hypernetwork, which receives as input a desired preference vector and returns a Pareto-optimal model whose loss vector is in the desired ray. The unified model is runtime efficient compared to training multiple models and generalizes to new operating points not used during training. We evaluate our method on a wide set of problems, from multi-task regression and classification to fairness. PHNs learn the entire Pareto front at roughly the same time as learning a single point on the front and at the same time reach a better solution set. PFL opens the door to new applications where models are selected based on preferences that are only available at run time.", + "title": "Learning the Pareto Front with Hypernetworks", + "authors": [ + "Aviv Navon", + "Aviv Shamsian", + "Ethan Fetaya", + "Gal Chechik" + ], + "emails": [ + "~Ethan_Fetaya1", + "~Aviv_Shamsian1", + "~Gal_Chechik1", + "~Aviv_Navon1" + ], + "rank": 618 + }, + { + "url": "https://openreview.net/forum?id=TBIzh9b5eaz", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 5, + 8, + 6 + ], + "rating": "6.29", + "confidences": [ + 3, + 3, + 4, + 3, + 4 + ], + "abstract": "Training Reinforcement Learning (RL) agents in high-stakes applications might be too prohibitive due to the risk associated to exploration. 
Thus, the agent can only use data previously collected by safe policies. While previous work considers optimizing the average performance using offline data, we focus on optimizing a risk-averse criteria, namely the CVaR. In particular, we present the Offline Risk-Averse Actor-Critic (O-RAAC), a model-free RL algorithm that is able to learn risk-averse policies in a fully offline setting. We show that O-RAAC learns policies with higher CVaR than risk-neutral approaches in different robot control tasks. Furthermore, considering risk-averse criteria guarantees distributional robustness of the average performance with respect to particular distribution shifts. We demonstrate empirically that in the presence of natural distribution-shifts, O-RAAC learns policies with good average performance. \n", + "title": "Risk-Averse Offline Reinforcement Learning", + "authors": [ + "N\u00faria Armengol Urp\u00ed", + "Sebastian Curi", + "Andreas Krause" + ], + "emails": [ + "~Sebastian_Curi1", + "~Andreas_Krause1", + "~N\u00faria_Armengol_Urp\u00ed1" + ], + "rank": 619 + }, + { + "url": "https://openreview.net/forum?id=1rxHOBjeDUW", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 6 + ], + "rating": "6.29", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "We propose a novel information bottleneck (IB) method named Drop-Bottleneck, which discretely drops features that are irrelevant to the target variable. Drop-Bottleneck not only enjoys a simple and tractable compression objective but also additionally provides a deterministic compressed representation of the input variable, which is useful for inference tasks that require consistent representation. Moreover, it can jointly learn a feature extractor and select features considering each feature dimension's relevance to the target task, which is unattainable by most neural network-based IB methods. We propose an exploration method based on Drop-Bottleneck for reinforcement learning tasks. 
In a multitude of noisy and reward sparse maze navigation tasks in VizDoom (Kempka et al., 2016) and DMLab (Beattie et al., 2016), our exploration method achieves state-of-the-art performance. As a new IB framework, we demonstrate that Drop-Bottleneck outperforms Variational Information Bottleneck (VIB) (Alemi et al., 2017) in multiple aspects including adversarial robustness and dimensionality reduction.", + "title": "Drop-Bottleneck: Learning Discrete Compressed Representation for Noise-Robust Exploration", + "authors": [ + "Jaekyeom Kim", + "Minjung Kim", + "Dongyeon Woo", + "Gunhee Kim" + ], + "emails": [ + "~Gunhee_Kim1", + "~Minjung_Kim2", + "~Dongyeon_Woo1", + "~Jaekyeom_Kim1" + ], + "rank": 620 + }, + { + "url": "https://openreview.net/forum?id=5Y21V0RDBV", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 7 + ], + "rating": "6.27", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Multiple data types naturally co-occur when describing real-world phenomena and learning from them is a long-standing goal in machine learning research. However, existing self-supervised generative models approximating an ELBO are not able to fulfill all desired requirements of multimodal models: their posterior approximation functions lead to a trade-off between the semantic coherence and the ability to learn the joint data distribution. We propose a new, generalized ELBO formulation for multimodal data that overcomes these limitations. The new objective encompasses two previous methods as special cases and combines their benefits without compromises. In extensive experiments, we demonstrate the advantage of the proposed method compared to state-of-the-art models in self-supervised, generative learning tasks.", + "title": "Generalized Multimodal ELBO", + "authors": [ + "Thomas M. 
Sutter", + "Imant Daunhawer", + "Julia E Vogt" + ], + "emails": [ + "~Imant_Daunhawer2", + "~Thomas_Marco_Sutter1", + "~Julia_E_Vogt1" + ], + "rank": 621 + }, + { + "url": "https://openreview.net/forum?id=ijJZbomCJIm", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 7 + ], + "rating": "6.27", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Transfer learning has emerged as a powerful methodology for adapting pre-trained deep neural networks on image recognition tasks to new domains. This process consists of taking a neural network pre-trained on a large feature-rich source dataset, freezing the early layers that encode essential generic image properties, and then fine-tuning the last few layers in order to capture specific information related to the target situation. This approach is particularly useful when only limited or weakly labeled data are available for the new task. In this work, we demonstrate that adversarially-trained models transfer better than non-adversarially-trained models, especially if only limited data are available for the new domain task. Further, we observe that adversarial training biases the learnt representations to retaining shapes, as opposed to textures, which impacts the transferability of the source models. Finally, through the lens of influence functions, we discover that transferred adversarially-trained models contain more human-identifiable semantic information, which explains -- at least partly -- why adversarially-trained models transfer better.", + "title": "Adversarially-Trained Deep Nets Transfer Better: Illustration on Image Classification", + "authors": [ + "Francisco Utrera", + "Evan Kravitz", + "N. Benjamin Erichson", + "Rajiv Khanna", + "Michael W. 
Mahoney" + ], + "emails": [ + "~N._Benjamin_Erichson1", + "~Michael_W._Mahoney1", + "berkeley.edu", + "~Rajiv_Khanna1", + "~Francisco_Utrera1" + ], + "rank": 622 + }, + { + "url": "https://openreview.net/forum?id=SZ3wtsXfzQR", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 7, + 7 + ], + "rating": "6.27", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Machine learning models have traditionally been developed under the assumption that the training and test distributions match exactly. However, recent success in few-shot learning and related problems are encouraging signs that these models can be adapted to more realistic settings where train and test distributions differ. Unfortunately, there is severely limited theoretical support for these algorithms and little is known about the difficulty of these problems. In this work, we provide novel information-theoretic lower-bounds on minimax rates of convergence for algorithms that are trained on data from multiple sources and tested on novel data. Our bounds depend intuitively on the information shared between sources of data, and characterize the difficulty of learning in this setting for arbitrary algorithms. 
We demonstrate these bounds on a hierarchical Bayesian model of meta-learning, computing both upper and lower bounds on parameter estimation via maximum-a-posteriori inference.", + "title": "Theoretical bounds on estimation error for meta-learning", + "authors": [ + "James Lucas", + "Mengye Ren", + "Irene Raissa KAMENI KAMENI", + "Toniann Pitassi", + "Richard Zemel" + ], + "emails": [ + "~Mengye_Ren1", + "~James_Lucas1", + "~Irene_Raissa_KAMENI_KAMENI1", + "~Toniann_Pitassi1", + "~Richard_Zemel1" + ], + "rank": 623 + }, + { + "url": "https://openreview.net/forum?id=8bZC3CyF-f7", + "decision": "Reject", + "ratings": [ + 7, + 7, + 6, + 5 + ], + "rating": "6.27", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Reinforcement Learning algorithms require a large number of samples to solve complex tasks with sparse and delayed rewards. \nComplex tasks are often hierarchically composed of sub-tasks.\nA step in the Q-function indicates solving a sub-task, where the expectation of the return increases. \nRUDDER identifies these steps and then redistributes reward to them, thus immediately giving reward if sub-tasks are solved. \nSince the delay of rewards is reduced, learning is considerably sped up.\nHowever, for complex tasks, current exploration strategies struggle with discovering episodes with high rewards.\nTherefore, we assume that episodes with high rewards are given as demonstrations and do not have to be discovered by exploration.\nTypically the number of demonstrations is small and RUDDER's LSTM model does not learn well.\nHence, we introduce Align-RUDDER, which is RUDDER with two major modifications. \nFirst, Align-RUDDER assumes that episodes with high rewards are given as demonstrations, \nreplacing RUDDER\u2019s safe exploration and lessons replay buffer.\nSecond, we substitute RUDDER\u2019s LSTM model by a profile model that is obtained from multiple sequence alignment of demonstrations. 
\nProfile models can be constructed from as few as two demonstrations.\nAlign-RUDDER inherits the concept of reward redistribution, which speeds up learning by reducing the delay of rewards. \nAlign-RUDDER outperforms competitors on complex artificial tasks with delayed reward and few demonstrations.\nOn the MineCraft ObtainDiamond task, Align-RUDDER is able to mine a diamond, though not frequently.", + "title": "Align-RUDDER: Learning From Few Demonstrations by Reward Redistribution", + "authors": [ + "Vihang Prakash Patil", + "Markus Hofmarcher", + "Marius-Constantin Dinu", + "Matthias Dorfer", + "Patrick M Blies", + "Johannes Brandstetter", + "Jose Arjona-Medina", + "Sepp Hochreiter" + ], + "emails": [ + "ml.jku.at", + "~Johannes_Brandstetter1", + "~Markus_Hofmarcher1", + "~Matthias_Dorfer1", + "~Sepp_Hochreiter1", + "~Vihang_Prakash_Patil1", + "~Jose_Arjona-Medina1", + "~Patrick_M_Blies1" + ], + "rank": 624 + }, + { + "url": "https://openreview.net/forum?id=Ldau9eHU-qO", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 5 + ], + "rating": "6.27", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "Robotic manipulation tasks, such as wiping with a soft sponge, require control from multiple rich sensory modalities. Human-robot interaction, aimed at teach- ing robots, is difficult in this setting as there is potential for mismatch between human and machine comprehension of the rich data streams. We treat the task of interpretable learning from demonstration as an optimisation problem over a probabilistic generative model. To account for the high-dimensionality of the data, a high-capacity neural network is chosen to represent the model. The latent variables in this model are explicitly aligned with high-level notions and concepts that are manifested in a set of demonstrations. 
We show that such alignment is best achieved through the use of labels from the end user, in an appropriately restricted vocabulary, in contrast to the conventional approach of the designer picking a prior over the latent variables. Our approach is evaluated in the context of two table-top robot manipulation tasks performed by a PR2 robot \u2013 that of dabbing liquids with a sponge (forcefully pressing a sponge and moving it along a surface) and pouring between different containers. The robot provides visual information, arm joint positions and arm joint efforts. We have made videos of the tasks and data available - see supplementary materials at: https://sites.google.com/view/weak-label-lfd.", + "title": "Learning from Demonstration with Weakly Supervised Disentanglement", + "authors": [ + "Yordan Hristov", + "Subramanian Ramamoorthy" + ], + "emails": [ + "~Yordan_Hristov1", + "~Subramanian_Ramamoorthy1" + ], + "rank": 625 + }, + { + "url": "https://openreview.net/forum?id=AWOSz_mMAPx", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 6 + ], + "rating": "6.27", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "We study the role that a finite timescale separation parameter \u03c4 has on gradient descent-ascent in non-convex, non-concave zero-sum games where the learning rate of player 1 is denoted by \u03b31 and the learning rate of player 2 is defined to be \u03b32=\u03c4\u03b31. We provide a non-asymptotic construction of the finite timescale separation parameter \u03c4\u2217 such that gradient descent-ascent locally converges to x\u2217 for all \u03c4\u2208(\u03c4\u2217,\u221e) if and only if it is a strict local minmax equilibrium. Moreover, we provide explicit local convergence rates given the finite timescale separation. 
The convergence results we present are complemented by a non-convergence result: given a critical point x\u2217 that is not a strict local minmax equilibrium, we present a non-asymptotic construction of a finite timescale separation \u03c40 such that gradient descent-ascent with timescale separation \u03c4\u2208(\u03c40,\u221e) does not converge to x\u2217. Finally, we extend the results to gradient penalty regularization methods for generative adversarial networks and empirically demonstrate on CIFAR-10 and CelebA the significant impact timescale separation has on training performance. ", + "title": "Local Convergence Analysis of Gradient Descent Ascent with Finite Timescale Separation", + "authors": [ + "Tanner Fiez", + "Lillian J Ratliff" + ], + "emails": [ + "~Lillian_J_Ratliff1", + "~Tanner_Fiez1" + ], + "rank": 626 + }, + { + "url": "https://openreview.net/forum?id=pwwVuSICBgt", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 8 + ], + "rating": "6.27", + "confidences": [ + 3, + 4, + 3, + 5 + ], + "abstract": "The ever-growing computational demands of increasingly complex machine learning models frequently necessitate the use of powerful cloud-based infrastructure for their training. Binary neural networks are known to be promising candidates for on-device inference due to their extreme compute and memory savings over higher-precision alternatives. In this paper, we demonstrate that they are also strongly robust to gradient quantization, thereby making the training of modern models on the edge a practical reality. We introduce a low-cost binary neural network training strategy exhibiting sizable memory footprint reductions and energy savings vs Courbariaux & Bengio's standard approach. Against the latter, we see coincident memory requirement and energy consumption drops of 2--6\u00d7, while reaching similar test accuracy, across a range of small-scale models trained to classify popular datasets. 
We also showcase ImageNet training of ResNetE-18, achieving a 3.12\u00d7 memory reduction over the aforementioned standard. Such savings will allow for unnecessary cloud offloading to be avoided, reducing latency and increasing energy efficiency while also safeguarding user privacy.", + "title": "Enabling Binary Neural Network Training on the Edge", + "authors": [ + "Erwei Wang", + "James J. Davis", + "Daniele Moro", + "Piotr Zielinski", + "Claudionor Coelho", + "Satrajit Chatterjee", + "Peter Y. K. Cheung", + "George Anthony Constantinides" + ], + "emails": [ + "alumni.stanford.edu", + "~Erwei_Wang1", + "imperial.ac.uk", + "google.com", + "~Satrajit_Chatterjee1", + "~Piotr_Zielinski1", + "~George_Anthony_Constantinides1" + ], + "rank": 627 + }, + { + "url": "https://openreview.net/forum?id=TiXl51SCNw8", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 6 + ], + "rating": "6.27", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Mixed-precision quantization can potentially achieve the optimal tradeoff between performance and compression rate of deep neural networks, and thus, have been widely investigated. However, it lacks a systematic method to determine the exact quantization scheme. Previous methods either examine only a small manually-designed search space or utilize a cumbersome neural architecture search to explore the vast search space. These approaches cannot lead to an optimal quantization scheme efficiently. This work proposes bit-level sparsity quantization (BSQ) to tackle the mixed-precision quantization from a new angle of inducing bit-level sparsity. We consider each bit of quantized weights as an independent trainable variable and introduce a differentiable bit-sparsity regularizer. BSQ can induce all-zero bits across a group of weight elements and realize the dynamic precision reduction, leading to a mixed-precision quantization scheme of the original model. 
Our method enables the exploration of the full mixed-precision space with a single gradient-based optimization process, with only one hyperparameter to tradeoff the performance and compression. BSQ achieves both higher accuracy and higher bit reduction on various model architectures on the CIFAR-10 and ImageNet datasets comparing to previous methods.", + "title": "BSQ: Exploring Bit-Level Sparsity for Mixed-Precision Neural Network Quantization", + "authors": [ + "Huanrui Yang", + "Lin Duan", + "Yiran Chen", + "Hai Li" + ], + "emails": [ + "duke.edu", + "~Yiran_Chen1", + "~Huanrui_Yang1", + "~Hai_Li1" + ], + "rank": 628 + }, + { + "url": "https://openreview.net/forum?id=U6Xpa5R-E1", + "decision": "Reject", + "ratings": [ + 6, + 6, + 7, + 6 + ], + "rating": "6.27", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "We propose the Neural Potts Model objective as an amortized optimization problem. The objective enables training a single model with shared parameters to explicitly model energy landscapes across multiple protein families. Given a protein sequence as input, the model is trained to predict a pairwise coupling matrix for a Potts model energy function describing the local evolutionary landscape of the sequence. Couplings can be predicted for novel sequences. 
A controlled ablation experiment assessing unsupervised contact prediction on sets of related protein families finds a gain from amortization for low-depth multiple sequence alignments; the result is then confirmed on a database with broad coverage of protein sequences.", + "title": "Neural Potts Model", + "authors": [ + "Tom Sercu", + "Robert Verkuil", + "Joshua Meier", + "Brandon Amos", + "Zeming Lin", + "Caroline Chen", + "Jason Liu", + "Yann LeCun", + "Alexander Rives" + ], + "emails": [ + "~Tom_Sercu1", + "~Zeming_Lin1", + "~Alexander_Rives1", + "~Joshua_Meier1", + "mit.edu", + "~Yann_LeCun1", + "fb.com", + "~Brandon_Amos1" + ], + "rank": 629 + }, + { + "url": "https://openreview.net/forum?id=YUGG2tFuPM", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 8, + 6, + 7 + ], + "rating": "6.27", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Adversarial poisoning attacks distort training data in order to corrupt the test-time behavior of a classifier. A provable defense provides a certificate for each test sample, which is a lower bound on the magnitude of any adversarial distortion of the training set that can corrupt the test sample's classification.\nWe propose two novel provable defenses against poisoning attacks: (i) Deep Partition Aggregation (DPA), a certified defense against a general poisoning threat model, defined as the insertion or deletion of a bounded number of samples to the training set --- by implication, this threat model also includes arbitrary distortions to a bounded number of images and/or labels; and (ii) Semi-Supervised DPA (SS-DPA), a certified defense against label-flipping poisoning attacks. DPA is an ensemble method where base models are trained on partitions of the training set determined by a hash function. DPA is related to both subset aggregation, a well-studied ensemble method in classical machine learning, as well as to randomized smoothing, a popular provable defense against evasion (inference) attacks. 
Our defense against label-flipping poison attacks, SS-DPA, uses a semi-supervised learning algorithm as its base classifier model: each base classifier is trained using the entire unlabeled training set in addition to the labels for a partition. SS-DPA significantly outperforms the existing certified defense for label-flipping attacks (Rosenfeld et al., 2020) on both MNIST and CIFAR-10: provably tolerating, for at least half of test images, over 600 label flips (vs. < 200 label flips) on MNIST and over 300 label flips (vs. 175 label flips) on CIFAR-10. Against general poisoning attacks where no prior certified defenses exists, DPA can certify \u2265 50% of test images against over 500 poison image insertions on MNIST, and nine insertions on CIFAR-10. These results establish new state-of-the-art provable defenses against general and label-flipping poison attacks. Code is available at https://github.com/alevine0/DPA", + "title": "Deep Partition Aggregation: Provable Defenses against General Poisoning Attacks", + "authors": [ + "Alexander Levine", + "Soheil Feizi" + ], + "emails": [ + "~Alexander_Levine2", + "~Soheil_Feizi2" + ], + "rank": 630 + }, + { + "url": "https://openreview.net/forum?id=XPZIaotutsD", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 6 + ], + "rating": "6.27", + "confidences": [ + 3, + 3, + 4, + 5 + ], + "abstract": "Recent progress in pre-trained neural language models has significantly improved the performance of many natural language processing (NLP) tasks. In this paper we propose a new model architecture DeBERTa (Decoding-enhanced BERT with disentangled attention) that improves the BERT and RoBERTa models using two novel techniques. The first is the disentangled attention mechanism, where each word is represented using two vectors that encode its content and position, respectively, and the attention weights among words are computed using disentangled matrices on their contents and relative positions, respectively. 
Second, an enhanced mask decoder is used to incorporate absolute positions in the decoding layer to predict the masked tokens in model pre-training. In addition, a new virtual adversarial training method is used for fine-tuning to improve models\u2019 \n generalization. We show that these techniques significantly improve the efficiency of model pre-training and the performance of both natural language understand(NLU) and natural langauge generation (NLG) downstream tasks. Compared to RoBERTa-Large, a DeBERTa model trained on half of the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9% (90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and RACE by +3.6% (83.2% vs. 86.8%). Notably, we scale up DeBERTa by training a larger version that consists of 48 Transform layers with 1.5 billion parameters. The significant performance boost makes the single DeBERTa model surpass the human performance on the SuperGLUE benchmark (Wang et al., 2019a) for the first time in terms of macro-average score (89.9 versus 89.8), and the ensemble DeBERTa model sits atop the SuperGLUE leaderboard as of January 6, 2021, outperforming the human baseline by a decent margin (90.3 versus\n89.8). 
The pre-trained DeBERTa models and the source code were released at: https://github.com/microsoft/DeBERTa.\n", + "title": "DEBERTA: DECODING-ENHANCED BERT WITH DISENTANGLED ATTENTION", + "authors": [ + "Pengcheng He", + "Xiaodong Liu", + "Jianfeng Gao", + "Weizhu Chen" + ], + "emails": [ + "~Weizhu_Chen1", + "~Xiaodong_Liu1", + "~Jianfeng_Gao1", + "~Pengcheng_He2" + ], + "rank": 631 + }, + { + "url": "https://openreview.net/forum?id=XQQA6-So14", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 5, + 7, + 7 + ], + "rating": "6.27", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "We propose a new class of parameterizations for spatio-temporal point processes which leverage Neural ODEs as a computational method and enable flexible, high-fidelity models of discrete events that are localized in continuous time and space. Central to our approach is a combination of continuous-time neural networks with two novel neural architectures, \\ie, Jump and Attentive Continuous-time Normalizing Flows. This approach allows us to learn complex distributions for both the spatial and temporal domain and to condition non-trivially on the observed event history. We validate our models on data sets from a wide variety of contexts such as seismology, epidemiology, urban mobility, and neuroscience.", + "title": "Neural Spatio-Temporal Point Processes", + "authors": [ + "Ricky T. Q. Chen", + "Brandon Amos", + "Maximilian Nickel" + ], + "emails": [ + "~Ricky_T._Q._Chen1", + "~Brandon_Amos1", + "~Maximilian_Nickel1" + ], + "rank": 632 + }, + { + "url": "https://openreview.net/forum?id=AICNpd8ke-m", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 5 + ], + "rating": "6.27", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "Post-hoc multi-class calibration is a common approach for providing high-quality confidence estimates of deep neural network predictions. 
Recent work has shown that widely used scaling methods underestimate their calibration error, while alternative Histogram Binning (HB) methods often fail to preserve classification accuracy. When classes have small prior probabilities, HB also faces the issue of severe sample-inefficiency after the conversion into K one-vs-rest class-wise calibration problems. The goal of this paper is to resolve the identified issues of HB in order to provide calibrated confidence estimates using only a small holdout calibration dataset for bin optimization while preserving multi-class ranking accuracy. From an information-theoretic perspective, we derive the I-Max concept for binning, which maximizes the mutual information between labels and quantized logits. This concept mitigates potential loss in ranking performance due to lossy quantization, and by disentangling the optimization of bin edges and representatives allows simultaneous improvement of ranking and calibration performance. To improve the sample efficiency and estimates from a small calibration set, we propose a shared class-wise (sCW) calibration strategy, sharing one calibrator among similar classes (e.g., with similar class priors) so that the training sets of their class-wise calibration problems can be merged to train the single calibrator. The combination of sCW and I-Max binning outperforms the state of the art calibration methods on various evaluation metrics across different benchmark datasets and models, using a small calibration set (e.g., 1k samples for ImageNet).", + "title": "Multi-Class Uncertainty Calibration via Mutual Information Maximization-based Binning", + "authors": [ + "Kanil Patel", + "William H. 
Beluch", + "Bin Yang", + "Michael Pfeiffer", + "Dan Zhang" + ], + "emails": [ + "~Kanil_Patel1", + "~Michael_Pfeiffer1", + "~Dan_Zhang1", + "~William_H._Beluch1", + "~Bin_Yang5" + ], + "rank": 633 + }, + { + "url": "https://openreview.net/forum?id=CNA6ZrpNDar", + "decision": "Reject", + "ratings": [ + 7, + 6, + 6, + 6 + ], + "rating": "6.27", + "confidences": [ + 3, + 1, + 3, + 4 + ], + "abstract": "This work tackles the problem of characterizing and understanding the decision boundaries of neural networks with piecewise linear non-linearity activations. We use tropical geometry, a new development in the area of algebraic geometry, to characterize the decision boundaries of a simple network of the form (Affine, ReLU, Affine). Our main finding is that the decision boundaries are a subset of a tropical hypersurface, which is intimately related to a polytope formed by the convex hull of two zonotopes. The generators of these zonotopes are functions of the network parameters. This geometric characterization provides new perspectives to three tasks. Specifically, we propose a new tropical perspective to the lottery ticket hypothesis, where we view the effect of different initializations on the tropical geometric representation of a network's decision boundaries. Moreover, we propose new tropical based optimization problems that directly influence the decision boundaries of the network for the tasks of network pruning (removing network parameters not contributing to the tropical geometric representation of the decision boundaries) and the generation of adversarial attacks.", + "title": "On the Decision Boundaries of Neural Networks. 
A Tropical Geometry Perspective", + "authors": [ + "Motasem Alfarra", + "Adel Bibi", + "Hasan Abed Al Kader Hammoud", + "Mohamed Gaafar", + "Bernard Ghanem" + ], + "emails": [ + "~Hasan_Abed_Al_Kader_Hammoud1", + "~Adel_Bibi1", + "~Mohamed_Gaafar1", + "~Bernard_Ghanem1", + "~Motasem_Alfarra1" + ], + "rank": 634 + }, + { + "url": "https://openreview.net/forum?id=PpshD0AXfA", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 7, + 5 + ], + "rating": "6.27", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Generating synthetic time-series data is crucial in various application domains, such as medical prognosis, wherein research is hamstrung by the lack of access to data due to concerns over privacy. Most of the recently proposed methods for generating synthetic time-series rely on implicit likelihood modeling using generative adversarial networks (GANs)\u2014but such models can be difficult to train, and may jeopardize privacy by \u201cmemorizing\u201d temporal patterns in training data. In this paper, we propose an explicit likelihood model based on a novel class of normalizing flows that view time-series data in the frequency-domain rather than the time-domain. The proposed flow, dubbed a Fourier flow, uses a discrete Fourier transform (DFT) to convert variable-length time-series with arbitrary sampling periods into fixed-length spectral representations, then applies a (data-dependent) spectral filter to the frequency-transformed time-series. We show that, by virtue of the DFT analytic properties, the Jacobian determinants and inverse mapping for the Fourier flow can be computed efficiently in linearithmic time, without imposing explicit structural constraints as in existing flows such as NICE (Dinh et al. (2014)), RealNVP (Dinh et al. (2016)) and GLOW (Kingma & Dhariwal (2018)). 
Experiments show that Fourier flows perform competitively compared to state-of-the-art baselines.", + "title": "Generative Time-series Modeling with Fourier Flows", + "authors": [ + "Ahmed Alaa", + "Alex James Chan", + "Mihaela van der Schaar" + ], + "emails": [ + "~Ahmed_Alaa1", + "~Alex_James_Chan1", + "~Mihaela_van_der_Schaar2" + ], + "rank": 635 + }, + { + "url": "https://openreview.net/forum?id=5NsEIflpbSv", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 7 + ], + "rating": "6.27", + "confidences": [ + 5, + 2, + 4, + 4 + ], + "abstract": "Training with soft targets instead of hard targets has been shown to improve performance and calibration of deep neural networks. Label smoothing is a popular way of computing soft targets, where one-hot encoding of a class is smoothed with a uniform distribution. Owing to its simplicity, label smoothing has found wide-spread use for training deep neural networks on a wide variety of tasks, ranging from image and text classification to machine translation and semantic parsing. Complementing recent empirical justification for label smoothing, we obtain PAC-Bayesian generalization bounds for label smoothing and show that the generalization error depends on the choice of the noise (smoothing) distribution. Then we propose low-rank adaptive label smoothing (LORAS): a simple yet novel method for training with learned soft targets that generalizes label smoothing and adapts to the latent structure of the label space in structured prediction tasks. Specifically, we evaluate our method on semantic parsing tasks and show that training with appropriately smoothed soft targets can significantly improve accuracy and model calibration, especially in low-resource settings. Used in conjunction with pre-trained sequence-to-sequence models, our method achieves state of the art performance on four semantic parsing data sets. 
LORAS can be used with any model, improves performance and implicit model calibration without increasing the number of model parameters, and can be scaled to problems with large label spaces containing tens of thousands of labels.", + "title": "Learning Better Structured Representations Using Low-rank Adaptive Label Smoothing", + "authors": [ + "Asish Ghoshal", + "Xilun Chen", + "Sonal Gupta", + "Luke Zettlemoyer", + "Yashar Mehdad" + ], + "emails": [ + "~Xilun_Chen1", + "~Yashar_Mehdad2", + "~Asish_Ghoshal2", + "fb.com", + "~Luke_Zettlemoyer1" + ], + "rank": 636 + }, + { + "url": "https://openreview.net/forum?id=dNy_RKzJacY", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 6 + ], + "rating": "6.27", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "We show how to assess a language model's knowledge of basic concepts of morality. We introduce the ETHICS dataset, a new benchmark that spans concepts in justice, well-being, duties, virtues, and commonsense morality. Models predict widespread moral judgments about diverse text scenarios. This requires connecting physical and social world knowledge to value judgements, a capability that may enable us to steer chatbot outputs or eventually regularize open-ended reinforcement learning agents. With the ETHICS dataset, we find that current language models have a promising but incomplete ability to predict basic human ethical judgements. 
Our work shows that progress can be made on machine ethics today, and it provides a steppingstone toward AI that is aligned with human values.", + "title": "Aligning AI With Shared Human Values", + "authors": [ + "Dan Hendrycks", + "Collin Burns", + "Steven Basart", + "Andrew Critch", + "Jerry Li", + "Dawn Song", + "Jacob Steinhardt" + ], + "emails": [ + "~Steven_Basart1", + "~Dawn_Song1", + "~Jacob_Steinhardt1", + "~Dan_Hendrycks1", + "columbia.edu", + "~Jerry_Li1", + "~Andrew_Critch1" + ], + "rank": 637 + }, + { + "url": "https://openreview.net/forum?id=lU5Rs_wCweN", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 7 + ], + "rating": "6.27", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "How to make unsupervised language pre-training more efficient and less resource-intensive is an important research direction in NLP. In this paper, we focus on improving the efficiency of language pre-training methods through providing better data utilization. It is well-known that in language data corpus, words follow a heavy-tail distribution. A large proportion of words appear only very few times and the embeddings of rare words are usually poorly optimized. We argue that such embeddings carry inadequate semantic signals, which could make the data utilization inefficient and slow down the pre-training of the entire model. To mitigate this problem, we propose Taking Notes on the Fly (TNF), which takes notes for rare words on the fly during pre-training to help the model understand them when they occur next time. Specifically, TNF maintains a note dictionary and saves a rare word's contextual information in it as notes when the rare word occurs in a sentence. When the same rare word occurs again during training, the note information saved beforehand can be employed to enhance the semantics of the current sentence. 
By doing so, TNF provides a better data utilization since cross-sentence information is employed to cover the inadequate semantics caused by rare words in the sentences. We implement TNF on both BERT and ELECTRA to check its efficiency and effectiveness. Experimental results show that TNF's training time is 60% less than its backbone pre-training models when reaching the same performance. When trained with same number of iterations, TNF outperforms its backbone methods on most of downstream tasks and the average GLUE score. Code is attached in the supplementary material.", + "title": "Taking Notes on the Fly Helps Language Pre-Training", + "authors": [ + "Qiyu Wu", + "Chen Xing", + "Yatao Li", + "Guolin Ke", + "Di He", + "Tie-Yan Liu" + ], + "emails": [ + "~Chen_Xing2", + "~Qiyu_Wu1", + "microsoft.com", + "~Tie-Yan_Liu1", + "~Guolin_Ke3", + "~Di_He1" + ], + "rank": 638 + }, + { + "url": "https://openreview.net/forum?id=djwS0m4Ft_A", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 7, + 8, + 5 + ], + "rating": "6.27", + "confidences": [ + 5, + 1, + 3, + 4, + 2 + ], + "abstract": "Learning disentangled representations is regarded as a fundamental task for improving the generalization, robustness, and interpretability of generative models. However, measuring disentanglement has been challenging and inconsistent, often dependent on an ad-hoc external model or specific to a certain dataset. To address this, we present a method for quantifying disentanglement that only uses the generative model, by measuring the topological similarity of conditional submanifolds in the learned representation. This method showcases both unsupervised and supervised variants. To illustrate the effectiveness and applicability of our method, we empirically evaluate several state-of-the-art models across multiple datasets. We find that our method ranks models similarly to existing methods. 
We make our code publicly available at https://github.com/stanfordmlgroup/disentanglement.", + "title": "Evaluating the Disentanglement of Deep Generative Models through Manifold Topology", + "authors": [ + "Sharon Zhou", + "Eric Zelikman", + "Fred Lu", + "Andrew Y. Ng", + "Gunnar E. Carlsson", + "Stefano Ermon" + ], + "emails": [ + "stanford.edu", + "~Sharon_Zhou1", + "~Eric_Zelikman1", + "~Gunnar_E._Carlsson1", + "~Andrew_Y._Ng1", + "~Stefano_Ermon1" + ], + "rank": 639 + }, + { + "url": "https://openreview.net/forum?id=Oj2hGyJwhwX", + "decision": "Reject", + "ratings": [ + 6, + 7, + 7, + 5 + ], + "rating": "6.27", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Normalization techniques are crucial in stabilizing and accelerating the training of deep neural networks. However, they are mainly designed for the independent and identically distributed (IID) data, not satisfying many real-world out-of-distribution (OOD) situations. Unlike most previous works, this paper presents two normalization methods, SelfNorm and CrossNorm, to promote OOD generalization. SelfNorm uses attention to recalibrate statistics (channel-wise mean and variance), while CrossNorm exchanges the statistics between feature maps. SelfNorm and CrossNorm can complement each other in OOD generalization, though exploring different directions in statistics usage. Extensive experiments on different domains (vision and language), tasks (classification and segmentation), and settings (supervised and semi-supervised) show their effectiveness.", + "title": "SelfNorm and CrossNorm for Out-of-Distribution Robustness", + "authors": [ + "Zhiqiang Tang", + "Yunhe Gao", + "Yi Zhu", + "Zhi Zhang", + "Mu Li", + "Dimitris N. 
Metaxas" + ], + "emails": [ + "~Yi_Zhu1", + "~Dimitris_N._Metaxas1", + "~Zhiqiang_Tang1", + "~Zhi_Zhang4", + "~Yunhe_Gao2", + "~Mu_Li1" + ], + "rank": 640 + }, + { + "url": "https://openreview.net/forum?id=d7KBjmI3GmQ", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 8, + 5, + 6 + ], + "rating": "6.27", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "We propose a new test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more. To attain high accuracy on this test, models must possess extensive world knowledge and problem solving ability. We find that while most recent models have near random-chance accuracy, the very largest GPT-3 model improves over random chance by almost 20 percentage points on average. However, on every one of the 57 tasks, the best models still need substantial improvements before they can reach expert-level accuracy. Models also have lopsided performance and frequently do not know when they are wrong. Worse, they still have near-random accuracy on some socially important subjects such as morality and law. 
By comprehensively evaluating the breadth and depth of a model's academic and professional understanding, our test can be used to analyze models across many tasks and to identify important shortcomings.", + "title": "Measuring Massive Multitask Language Understanding", + "authors": [ + "Dan Hendrycks", + "Collin Burns", + "Steven Basart", + "Andy Zou", + "Mantas Mazeika", + "Dawn Song", + "Jacob Steinhardt" + ], + "emails": [ + "~Steven_Basart1", + "~Dawn_Song1", + "~Jacob_Steinhardt1", + "~Dan_Hendrycks1", + "columbia.edu", + "~Mantas_Mazeika3", + "berkeley.edu" + ], + "rank": 641 + }, + { + "url": "https://openreview.net/forum?id=Mh1Abj33qI", + "decision": "Reject", + "ratings": [ + 6, + 6, + 8, + 4 + ], + "rating": "6.27", + "confidences": [ + 4, + 3, + 5, + 3 + ], + "abstract": "Many popular graph neural network (GNN) architectures, which are often considered as the current state of the art, rely on encoding graph structure via smoothness or similarity between neighbors. While this approach performs well on a surprising number of standard benchmarks, the efficacy of such models does not translate consistently to more complex domains, such as graph data in the biochemistry domain. We argue that these more complex domains require priors that encourage learning of longer range features rather than oversmoothed signals of standard GNN architectures. Here, we propose an alternative GNN architecture, based on a relaxation of recently proposed geometric scattering transforms, which consists of a cascade of graph wavelet filters. Our learned geometric scattering (LEGS) architecture adaptively tunes these wavelets and their scales to encourage band-pass features to emerge in learned representations. This results in a simplified GNN with significantly fewer learned parameters compared to competing methods. 
We demonstrate the predictive performance of our method on several biochemistry graph classification benchmarks, as well as the descriptive quality of its learned features in biochemical graph data exploration tasks. Our results show that the proposed LEGS network matches or outperforms popular GNNs, as well as the original geometric scattering construction, while retaining certain mathematical properties of its handcrafted (nonlearned) design.", + "title": "Data-driven Learning of Geometric Scattering Networks", + "authors": [ + "Alexander Tong", + "Frederik Wenkel", + "Kincaid Macdonald", + "Smita Krishnaswamy", + "Guy Wolf" + ], + "emails": [ + "~Smita_Krishnaswamy1", + "yale.edu", + "~Alexander_Tong1", + "umontreal.ca", + "~Guy_Wolf1" + ], + "rank": 642 + }, + { + "url": "https://openreview.net/forum?id=pHXfe1cOmA", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 7 + ], + "rating": "6.27", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "We propose HyperDynamics, a dynamics meta-learning framework that conditions on an agent\u2019s interactions with the environment and optionally its visual observations, and generates the parameters of neural dynamics models based on inferred properties of the dynamical system. Physical and visual properties of the environment that are not part of the low-dimensional state yet affect its temporal dynamics are inferred from the interaction history and visual observations, and are implicitly captured in the generated parameters. We test HyperDynamics on a set of object pushing and locomotion tasks. It outperforms existing dynamics models in the literature that adapt to environment variations by learning dynamics over high dimensional visual observations, capturing the interactions of the agent in recurrent state representations, or using gradient-based meta-optimization. 
We also show our method matches the performance of an ensemble of separately trained experts, while also being able to generalize well to unseen environment variations at test time. We attribute its good performance to the multiplicative interactions between the inferred system properties\u2014captured in the generated parameters\u2014and the low-dimensional state representation of the dynamical system.", + "title": "HyperDynamics: Meta-Learning Object and Agent Dynamics with Hypernetworks", + "authors": [ + "Zhou Xian", + "Shamit Lal", + "Hsiao-Yu Tung", + "Emmanouil Antonios Platanios", + "Katerina Fragkiadaki" + ], + "emails": [ + "~Katerina_Fragkiadaki1", + "~Zhou_Xian1", + "~Hsiao-Yu_Tung1", + "~Shamit_Lal1", + "~Emmanouil_Antonios_Platanios1" + ], + "rank": 643 + }, + { + "url": "https://openreview.net/forum?id=D1E1h-K3jso", + "decision": "Reject", + "ratings": [ + 7, + 6, + 6, + 6 + ], + "rating": "6.27", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Learning from noisy data has attracted much attention, where most methods focus on label noise. In this work, we propose a new framework which simultaneously addresses three types of noise commonly seen in real-world data: label noise, out-of-distribution input, and input corruption. In contrast to most existing methods, we combat noise by learning robust representation. Specifically, we embed images into a low-dimensional subspace by training an autoencoder on the deep features. We regularize the geometric structure of the subspace with robust contrastive learning, which includes an unsupervised consistency loss and a supervised mixup prototypical loss. Furthermore, we leverage the structure of the learned subspace for noise cleaning, by aggregating information from neighboring samples. Experiments on multiple benchmarks demonstrate state-of-the-art performance of our method and robustness of the learned representation. 
Our code will be released.", + "title": "Learning from Noisy Data with Robust Representation Learning", + "authors": [ + "Junnan Li", + "Caiming Xiong", + "Steven Hoi" + ], + "emails": [ + "~Steven_Hoi2", + "~Caiming_Xiong1", + "~Junnan_Li2" + ], + "rank": 644 + }, + { + "url": "https://openreview.net/forum?id=PRr_3HPakQ", + "decision": "Reject", + "ratings": [ + 7, + 6, + 5, + 7 + ], + "rating": "6.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "To train a question answering model based on machine reading comprehension (MRC), significant effort is required to prepare annotated training data composed of questions and their answers from contexts. To mitigate this issue, recent research has focused on synthetically generating a question from a given context and an annotated (or generated) answer by training an additional generative model, which can be utilized to augment the training data. In light of this research direction, we propose a novel pre-training approach that learns to generate contextually rich questions, by recovering answer-containing sentences. Our approach is composed of two novel components, (1) dynamically determining K answers from a given document and (2) pre-training the question generator on the task of generating the answer-containing sentence. We evaluate our method against existing ones in terms of the quality of generated questions as well as the fine-tuned MRC model accuracy after training on the data synthetically generated by our method. Experimental results demonstrate that our approach consistently improves the question generation capability of existing models such as T5 and UniLM, and shows state-of-the-art results on MS MARCO and NewsQA, and comparable results to the state-of-the-art on SQuAD. 
Additionally, we demonstrate that the data synthetically generated by our approach is beneficial for boosting up the downstream MRC accuracy across a wide range of datasets, such as SQuAD-v1.1, v2.0, and KorQuAD, without any modification to the existing MRC models. Furthermore, our experiments highlight that our method shines especially when a limited amount of training data is given, in terms of both pre-training and downstream MRC data.", + "title": "Learning to Generate Questions by Recovering Answer-containing Sentences", + "authors": [ + "Seohyun Back", + "Akhil Kedia", + "Sai Chetan Chinthakindi", + "Haejun Lee", + "Jaegul Choo" + ], + "emails": [ + "~Sai_Chetan_Chinthakindi1", + "~Seohyun_Back1", + "~Akhil_Kedia1", + "~Jaegul_Choo1", + "samsung.com" + ], + "rank": 645 + }, + { + "url": "https://openreview.net/forum?id=sjuuTm4vj0", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 8, + 5, + 7 + ], + "rating": "6.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "In recent years, Generative Adversarial Networks have become ubiquitous in both research and public perception, but how GANs convert an unstructured latent code to a high quality output is still an open question. In this work, we investigate regression into the latent space as a probe to understand the compositional properties of GANs. We find that combining the regressor and a pretrained generator provides a strong image prior, allowing us to create composite images from a collage of random image parts at inference time while maintaining global consistency. To compare compositional properties across different generators, we measure the trade-offs between reconstruction of the unrealistic input and image quality of the regenerated samples. We find that the regression approach enables more localized editing of individual image parts compared to direct editing in the latent space, and we conduct experiments to quantify this independence effect. 
Our method is agnostic to the semantics of edits, and does not require labels or predefined concepts during training. Beyond image composition, our method extends to a number of related applications, such as image inpainting or example-based image editing, which we demonstrate on several GANs and datasets, and\nbecause it uses only a single forward pass, it can operate in real-time. Code is available on our project page: https://chail.github.io/latent-composition/.", + "title": "Using latent space regression to analyze and leverage compositionality in GANs", + "authors": [ + "Lucy Chai", + "Jonas Wulff", + "Phillip Isola" + ], + "emails": [ + "~Jonas_Wulff1", + "~Phillip_Isola1", + "~Lucy_Chai1" + ], + "rank": 646 + }, + { + "url": "https://openreview.net/forum?id=6FqKiVAdI3Y", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 9, + 3, + 7 + ], + "rating": "6.25", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "Multi-agent policy gradient (MAPG) methods recently witness vigorous progress. However, there is a significant performance discrepancy between MAPG methods and state-of-the-art multi-agent value-based approaches. In this paper, we investigate causes that hinder the performance of MAPG algorithms and present a multi-agent decomposed policy gradient method (DOP). This method introduces the idea of value function decomposition into the multi-agent actor-critic framework. Based on this idea, DOP supports efficient off-policy learning and addresses the issue of centralized-decentralized mismatch and credit assignment in both discrete and continuous action spaces. We formally show that DOP critics have sufficient representational capability to guarantee convergence. In addition, empirical evaluations on the StarCraft II micromanagement benchmark and multi-agent particle environments demonstrate that DOP outperforms both state-of-the-art value-based and policy-based multi-agent reinforcement learning algorithms. 
Demonstrative videos are available at https://sites.google.com/view/dop-mapg/.", + "title": "DOP: Off-Policy Multi-Agent Decomposed Policy Gradients", + "authors": [ + "Yihan Wang", + "Beining Han", + "Tonghan Wang", + "Heng Dong", + "Chongjie Zhang" + ], + "emails": [ + "~Heng_Dong1", + "~Yihan_Wang1", + "~Beining_Han1", + "~Chongjie_Zhang1", + "~Tonghan_Wang1" + ], + "rank": 647 + }, + { + "url": "https://openreview.net/forum?id=kLbhLJ8OT12", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 6 + ], + "rating": "6.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Designing task-oriented dialogue systems is a challenging research topic, since it needs not only to generate utterances fulfilling user requests but also to guarantee the comprehensibility. Many previous works trained end-to-end (E2E) models with supervised learning (SL), however, the bias in annotated system utterances remains as a bottleneck. Reinforcement learning (RL) deals with the problem through using non-differentiable evaluation metrics (e.g., the success rate) as rewards. Nonetheless, existing works with RL showed that the comprehensibility of generated system utterances could be corrupted when improving the performance on fulfilling user requests. In our work, we (1) propose modelling the hierarchical structure between dialogue policy and natural language generator (NLG) with the option framework, called HDNO, where the latent dialogue act is applied to avoid designing specific dialogue act representations; (2) train HDNO via hierarchical reinforcement learning (HRL), as well as suggest the asynchronous updates between dialogue policy and NLG during training to theoretically guarantee their convergence to a local maximizer; and (3) propose using a discriminator modelled with language models as an additional reward to further improve the comprehensibility. 
We test HDNO on MultiWoz 2.0 and MultiWoz 2.1, the datasets on multi-domain dialogues, in comparison with word-level E2E model trained with RL, LaRL and HDSA, showing improvements on the performance evaluated by automatic evaluation metrics and human evaluation. Finally, we demonstrate the semantic meanings of latent dialogue acts to show the explanability for HDNO.", + "title": "Modelling Hierarchical Structure between Dialogue Policy and Natural Language Generator with Option Framework for Task-oriented Dialogue System", + "authors": [ + "Jianhong Wang", + "Yuan Zhang", + "Tae-Kyun Kim", + "Yunjie Gu" + ], + "emails": [ + "imperial.ac.uk", + "~Yuan_Zhang7", + "~Jianhong_Wang1", + "~Tae-Kyun_Kim2" + ], + "rank": 648 + }, + { + "url": "https://openreview.net/forum?id=Qun8fv4qSby", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 5, + 7, + 8 + ], + "rating": "6.25", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "Non-stationarity can arise in Reinforcement Learning (RL) even in stationary environments. For example, most RL algorithms collect new data throughout training, using a non-stationary behaviour policy. Due to the transience of this non-stationarity, it is often not explicitly addressed in deep RL and a single neural network is continually updated. However, we find evidence that neural networks exhibit a memory effect, where these transient non-stationarities can permanently impact the latent representation and adversely affect generalisation performance. Consequently, to improve generalisation of deep RL agents, we propose Iterated Relearning (ITER). ITER augments standard RL training by repeated knowledge transfer of the current policy into a freshly initialised network, which thereby experiences less non-stationarity during training. 
Experimentally, we show that ITER improves performance on the challenging generalisation benchmarks ProcGen and Multiroom.", + "title": "Transient Non-stationarity and Generalisation in Deep Reinforcement Learning", + "authors": [ + "Maximilian Igl", + "Gregory Farquhar", + "Jelena Luketina", + "Wendelin Boehmer", + "Shimon Whiteson" + ], + "emails": [ + "~Shimon_Whiteson1", + "~Maximilian_Igl1", + "~Jelena_Luketina1", + "~Wendelin_Boehmer1", + "~Gregory_Farquhar1" + ], + "rank": 649 + }, + { + "url": "https://openreview.net/forum?id=5NA1PinlGFu", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 6, + 7 + ], + "rating": "6.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "We present the Colorization Transformer, a novel approach for diverse high fidelity image colorization based on self-attention. Given a grayscale image, the colorization proceeds in three steps. We first use a conditional autoregressive transformer to produce a low resolution coarse coloring of the grayscale image. Our architecture adopts conditional transformer layers to effectively condition grayscale input. Two subsequent fully parallel networks upsample the coarse colored low resolution image into a finely colored high resolution image. Sampling from the Colorization Transformer produces diverse colorings whose fidelity outperforms the previous state-of-the-art on colorising ImageNet based on FID results and based on a human evaluation in a Mechanical Turk test. Remarkably, in more than 60\\% of cases human evaluators prefer the highest rated among three generated colorings over the ground truth. 
The code and pre-trained checkpoints for Colorization Transformer are publicly available at https://github.com/google-research/google-research/tree/master/coltran", + "title": "Colorization Transformer", + "authors": [ + "Manoj Kumar", + "Dirk Weissenborn", + "Nal Kalchbrenner" + ], + "emails": [ + "~Nal_Kalchbrenner1", + "~Manoj_Kumar1", + "~Dirk_Weissenborn1" + ], + "rank": 650 + }, + { + "url": "https://openreview.net/forum?id=XAS3uKeFWj", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 6 + ], + "rating": "6.25", + "confidences": [ + 4, + 3, + 5, + 4 + ], + "abstract": "We solve the problem of 6-DoF localisation and 3D dense reconstruction in spatial environments as approximate Bayesian inference in a deep state-space model. Our approach leverages both learning and domain knowledge from multiple-view geometry and rigid-body dynamics. This results in an expressive predictive model of the world, often missing in current state-of-the-art visual SLAM solutions. The combination of variational inference, neural networks and a differentiable raycaster ensures that our model is amenable to end-to-end gradient-based optimisation. We evaluate our approach on realistic unmanned aerial vehicle flight data, nearing the performance of state-of-the-art visual-inertial odometry systems. 
We demonstrate the applicability of the model to generative prediction and planning.", + "title": "Variational State-Space Models for Localisation and Dense 3D Mapping in 6 DoF", + "authors": [ + "Atanas Mirchev", + "Baris Kayalibay", + "Patrick van der Smagt", + "Justin Bayer" + ], + "emails": [ + "~Baris_Kayalibay1", + "~Justin_Bayer1", + "~Patrick_van_der_Smagt1", + "~Atanas_Mirchev1" + ], + "rank": 651 + }, + { + "url": "https://openreview.net/forum?id=F8whUO8HNbP", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 7 + ], + "rating": "6.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Training on synthetic data can be beneficial for label or data-scarce scenarios. However, synthetically trained models often suffer from poor generalization in real domains due to domain gaps. In this work, we make a key observation that the diversity of the learned feature embeddings plays an important role in the generalization performance. To this end, we propose contrastive synthetic-to-real generalization (CSG), a novel framework that leverage the pre-trained ImageNet knowledge to prevent overfitting to the synthetic domain, while promoting the diversity of feature embeddings as an inductive bias to improve generalization. In addition, we enhance the proposed CSG framework with attentional pooling (A-pool) to let the model focus on semantically important regions and further improve its generalization. We demonstrate the effectiveness of CSG on various synthetic training tasks, exhibiting state-of-the-art performance on zero-shot domain generalization.", + "title": "Contrastive Syn-to-Real Generalization", + "authors": [ + "Wuyang Chen", + "Zhiding Yu", + "Shalini De Mello", + "Sifei Liu", + "Jose M. 
Alvarez", + "Zhangyang Wang", + "Anima Anandkumar" + ], + "emails": [ + "~Shalini_De_Mello1", + "~Wuyang_Chen1", + "~Zhangyang_Wang1", + "~Sifei_Liu2", + "~Zhiding_Yu1", + "~Anima_Anandkumar1", + "~Jose_M._Alvarez2" + ], + "rank": 652 + }, + { + "url": "https://openreview.net/forum?id=KxUlUb26-P3", + "decision": "Reject", + "ratings": [ + 5, + 7, + 8, + 5 + ], + "rating": "6.25", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "Real-world applications often require making use of {\\em a range of incidental supervision signals}. However, we currently lack a principled way to measure the benefit an incidental training dataset can bring, and the common practice of using indirect, weaker signals is through exhaustive experiments with various models and hyper-parameters. This paper studies whether we can, {\\em in a single framework, quantify the benefit of various types of incidental signals for one's target task without going through combinatorial experiments}. We propose PABI, a unified informativeness measure motivated by PAC-Bayesian theory, characterizing the reduction in uncertainty that indirect, weak signals provide. We demonstrate PABI's use in quantifying various types of incidental signals including partial labels, noisy labels, constraints, cross-domain signals, and combinations of these. 
Experiments with various setups on two natural language processing (NLP) tasks, named entity recognition (NER) and question answering (QA), show that PABI correlates well with learning performance, providing a promising way to determine, ahead of learning, which supervision signals would be beneficial.", + "title": "PABI: A Unified PAC-Bayesian Informativeness Measure for Incidental Supervision Signals", + "authors": [ + "Hangfeng He", + "Mingyuan Zhang", + "Qiang Ning", + "Dan Roth" + ], + "emails": [ + "seas.upenn.edu", + "~Hangfeng_He3", + "~Dan_Roth3", + "amazon.com" + ], + "rank": 653 + }, + { + "url": "https://openreview.net/forum?id=OMNB1G5xzd4", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 5, + 5, + 7 + ], + "rating": "6.25", + "confidences": [ + 4, + 3, + 5, + 4 + ], + "abstract": "Offline learning is a key part of making reinforcement learning (RL) useable in real systems. Offline RL looks at scenarios where there is data from a system's operation, but no direct access to the system when learning a policy. Recent work on training RL policies from offline data has shown results both with model-free policies learned directly from the data, or with planning on top of learnt models of the data. Model-free policies tend to be more performant, but are more opaque, harder to command externally, and less easy to integrate into larger systems. We propose an offline learner that generates a model that can be used to control the system directly through planning. This allows us to have easily controllable policies directly from data, without ever interacting with the system. We show the performance of our algorithm, Model-Based Offline Planning (MBOP) on a series of robotics-inspired tasks, and demonstrate its ability leverage planning to respect environmental constraints. 
We are able to find near-optimal polices for certain simulated systems from as little as 50 seconds of real-time system interaction, and create zero-shot goal-conditioned policies on a series of environments.", + "title": "Model-Based Offline Planning", + "authors": [ + "Arthur Argenson", + "Gabriel Dulac-Arnold" + ], + "emails": [ + "~Gabriel_Dulac-Arnold1", + "google.com" + ], + "rank": 654 + }, + { + "url": "https://openreview.net/forum?id=VqzVhqxkjH1", + "decision": "Accept (Spotlight)", + "ratings": [ + 6, + 7, + 6, + 6 + ], + "rating": "6.25", + "confidences": [ + 4, + 3, + 3, + 2 + ], + "abstract": "In Machine Learning as a Service, a provider trains a deep neural network and gives many users access. The hosted (source) model is susceptible to model stealing attacks, where an adversary derives a surrogate model from API access to the source model. For post hoc detection of such attacks, the provider needs a robust method to determine whether a suspect model is a surrogate of their model. We propose a fingerprinting method for deep neural network classifiers that extracts a set of inputs from the source model so that only surrogates agree with the source model on the classification of such inputs. These inputs are a subclass of transferable adversarial examples which we call conferrable adversarial examples that exclusively transfer with a target label from a source model to its surrogates. We propose a new method to generate these conferrable adversarial examples. We present an extensive study on the irremovability of our fingerprint against fine-tuning, weight pruning, retraining, retraining with different architectures, three model extraction attacks from related work, transfer learning, adversarial training, and two new adaptive attacks. Our fingerprint is robust against distillation, related model extraction attacks, and even transfer learning when the attacker has no access to the model provider's dataset. 
Our fingerprint is the first method that reaches a ROC AUC of 1.0 in verifying surrogates, compared to a ROC AUC of 0.63 by previous fingerprints. ", + "title": "Deep Neural Network Fingerprinting by Conferrable Adversarial Examples", + "authors": [ + "Nils Lukas", + "Yuxuan Zhang", + "Florian Kerschbaum" + ], + "emails": [ + "~Nils_Lukas1", + "~Yuxuan_Zhang1", + "~Florian_Kerschbaum1" + ], + "rank": 655 + }, + { + "url": "https://openreview.net/forum?id=MDsQkFP1Aw", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 6 + ], + "rating": "6.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Recent progress in deep learning has enabled many advances in sound separation and visual scene understanding. However, extracting sound sources which are apparent in natural videos remains an open problem. In this work, we present AudioScope, a novel audio-visual sound separation framework that can be trained without supervision to isolate on-screen sound sources from real in-the-wild videos. Prior audio-visual separation work assumed artificial limitations on the domain of sound classes (e.g., to speech or music), constrained the number of sources, and required strong sound separation or visual segmentation labels. AudioScope overcomes these limitations, operating on an open domain of sounds, with variable numbers of sources, and without labels or prior visual segmentation. The training procedure for AudioScope uses mixture invariant training (MixIT) to separate synthetic mixtures of mixtures (MoMs) into individual sources, where noisy labels for mixtures are provided by an unsupervised audio-visual coincidence model. Using the noisy labels, along with attention between video and audio features, AudioScope learns to identify audio-visual similarity and to suppress off-screen sounds. We demonstrate the effectiveness of our approach using a dataset of video clips extracted from open-domain YFCC100m video data. 
This dataset contains a wide diversity of sound classes recorded in unconstrained conditions, making the application of previous methods unsuitable. For evaluation and semi-supervised experiments, we collected human labels for presence of on-screen and off-screen sounds on a small subset of clips.", + "title": "Into the Wild with AudioScope: Unsupervised Audio-Visual Separation of On-Screen Sounds", + "authors": [ + "Efthymios Tzinis", + "Scott Wisdom", + "Aren Jansen", + "Shawn Hershey", + "Tal Remez", + "Dan Ellis", + "John R. Hershey" + ], + "emails": [ + "~Scott_Wisdom1", + "~Efthymios_Tzinis1", + "~Dan_Ellis1", + "google.com", + "~John_R._Hershey1" + ], + "rank": 656 + }, + { + "url": "https://openreview.net/forum?id=j1RMMKeP2gR", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 6, + 8 + ], + "rating": "6.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "The standard Markov Decision Process (MDP) formulation hinges on the assumption that an action is executed immediately after it was chosen. However, assuming it is often unrealistic and can lead to catastrophic failures in applications such as robotic manipulation, cloud computing, and finance. We introduce a framework for learning and planning in MDPs where the decision-maker commits actions that are executed with a delay of m steps. The brute-force state augmentation baseline where the state is concatenated to the last m committed actions suffers from an exponential complexity in m, as we show for policy iteration. We then prove that with execution delay, deterministic Markov policies in the original state-space are sufficient for attaining maximal reward, but need to be non-stationary. As for stationary Markov policies, we show they are sub-optimal in general. Consequently, we devise a non-stationary Q-learning style model-based algorithm that solves delayed execution tasks without resorting to state-augmentation. 
Experiments on tabular, physical, and Atari domains reveal that it converges quickly to high performance even for substantial delays, while standard approaches that either ignore the delay or rely on state-augmentation struggle or fail due to divergence. The code is available at \\url{https://github.com/galdl/rl_delay_basic.git}.", + "title": "Acting in Delayed Environments with Non-Stationary Markov Policies", + "authors": [ + "Esther Derman", + "Gal Dalal", + "Shie Mannor" + ], + "emails": [ + "~Esther_Derman1", + "~Gal_Dalal2", + "~Shie_Mannor2" + ], + "rank": 657 + }, + { + "url": "https://openreview.net/forum?id=IFqrg1p5Bc", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 6, + 7 + ], + "rating": "6.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "We investigate approaches to regularisation during fine-tuning of deep neural networks. First we provide a neural network generalisation bound based on Rademacher complexity that uses the distance the weights have moved from their initial values. This bound has no direct dependence on the number of weights and compares favourably to other bounds when applied to convolutional networks. Our bound is highly relevant for fine-tuning, because providing a network with a good initialisation based on transfer learning means that learning can modify the weights less, and hence achieve tighter generalisation. Inspired by this, we develop a simple yet effective fine-tuning algorithm that constrains the hypothesis class to a small sphere centred on the initial pre-trained weights, thus obtaining provably better generalisation performance than conventional transfer learning. Empirical evaluation shows that our algorithm works well, corroborating our theoretical results. 
It outperforms both state of the art fine-tuning competitors, and penalty-based alternatives that we show do not directly constrain the radius of the search space.", + "title": "Distance-Based Regularisation of Deep Networks for Fine-Tuning", + "authors": [ + "Henry Gouk", + "Timothy Hospedales", + "massimiliano pontil" + ], + "emails": [ + "~Henry_Gouk1", + "~massimiliano_pontil1", + "~Timothy_Hospedales1" + ], + "rank": 658 + }, + { + "url": "https://openreview.net/forum?id=iOnhIy-a-0n", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 7, + 6 + ], + "rating": "6.25", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "Replica exchange stochastic gradient Langevin dynamics (reSGLD) has shown promise in accelerating the convergence in non-convex learning; however, an excessively large correction for avoiding biases from noisy energy estimators has limited the potential of the acceleration. To address this issue, we study the variance reduction for noisy energy estimators, which promotes much more effective swaps. Theoretically, we provide a non-asymptotic analysis on the exponential convergence for the underlying continuous-time Markov jump process; moreover, we consider a generalized Girsanov theorem which includes the change of Poisson measure to overcome the crude discretization based on the Gr\\\"{o}wall's inequality and yields a much tighter error in the 2-Wasserstein (W2) distance. Numerically, we conduct extensive experiments and obtain state-of-the-art results in optimization and uncertainty estimates for synthetic experiments and image data.", + "title": "Accelerating Convergence of Replica Exchange Stochastic Gradient MCMC via Variance Reduction", + "authors": [ + "Wei Deng", + "Qi Feng", + "Georgios P. 
Karagiannis", + "Guang Lin", + "Faming Liang" + ], + "emails": [ + "~Faming_Liang1", + "durham.ac.uk", + "~Guang_Lin1", + "usc.edu", + "~Wei_Deng1" + ], + "rank": 659 + }, + { + "url": "https://openreview.net/forum?id=4IwieFS44l", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 6 + ], + "rating": "6.25", + "confidences": [ + 4, + 4, + 3, + 5 + ], + "abstract": "The efficient and accurate characterization of the robustness of neural networks to input perturbation is an important open problem. Many approaches exist including heuristic and exact (or complete) methods. Complete methods are expensive but their mathematical formulation guarantees that they provide exact robustness metrics. However, this guarantee is valid only if we assume that the verified network applies arbitrary-precision arithmetic and the verifier is reliable. In practice, however, both the networks and the verifiers apply limited-precision floating point arithmetic. In this paper, we show that numerical roundoff errors can be exploited to craft adversarial networks, in which the actual robustness and the robustness computed by a state-of-the-art complete verifier radically differ. We also show that such adversarial networks can be used to insert a backdoor into any network in such a way that the backdoor is completely missed by the verifier. The attack is easy to detect in its naive form but, as we show, the adversarial network can be transformed to make its detection less trivial. We offer a simple defense against our particular attack based on adding a very small perturbation to the network weights. 
However, our conjecture is that other numerical attacks are possible, and exact verification has to take into account all the details of the computation executed by the verified networks, which makes the problem significantly harder.\n\n", + "title": "Fooling a Complete Neural Network Verifier", + "authors": [ + "D\u00e1niel Zombori", + "Bal\u00e1zs B\u00e1nhelyi", + "Tibor Csendes", + "Istv\u00e1n Megyeri", + "M\u00e1rk Jelasity" + ], + "emails": [ + "inf.u-szeged.hu", + "~M\u00e1rk_Jelasity1" + ], + "rank": 660 + }, + { + "url": "https://openreview.net/forum?id=ujmgfuxSLrO", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 6 + ], + "rating": "6.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "We introduce a deep and light-weight transformer, DeLighT, that delivers similar or better performance than standard transformer-based models with significantly fewer parameters. DeLighT more efficiently allocates parameters both (1) within each Transformer block using the DeLighT transformation, a deep and light-weight transformation and (2) across blocks using block-wise scaling, that allows for shallower and narrower DeLighT blocks near the input and wider and deeper DeLighT blocks near the output. Overall, DeLighT networks are 2.5 to 4 times deeper than standard transformer models and yet have fewer parameters and operations. Experiments on benchmark machine translation and language modeling tasks show that DeLighT matches or improves the performance of baseline Transformers with 2 to 3 times fewer parameters on average. 
", + "title": "DeLighT: Deep and Light-weight Transformer", + "authors": [ + "Sachin Mehta", + "Marjan Ghazvininejad", + "Srinivasan Iyer", + "Luke Zettlemoyer", + "Hannaneh Hajishirzi" + ], + "emails": [ + "~Marjan_Ghazvininejad1", + "fb.com", + "~Sachin_Mehta1", + "~Hannaneh_Hajishirzi1", + "~Luke_Zettlemoyer1" + ], + "rank": 661 + }, + { + "url": "https://openreview.net/forum?id=DNl5s5BXeBn", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 7, + 7 + ], + "rating": "6.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Training classifiers under fairness constraints such as group fairness, regularizes the disparities of predictions between the groups. Nevertheless, even though the constraints are satisfied during training, they might not generalize at evaluation time. To improve the generalizability of fair classifiers, we propose fair mixup, a new data augmentation strategy for imposing the fairness constraint. In particular, we show that fairness can be achieved by regularizing the models on paths of interpolated samples between the groups. We use mixup, a powerful data augmentation strategy to generate these interpolates. We analyze fair mixup and empirically show that it ensures a better generalization for both accuracy and fairness measurement in tabular, vision, and language benchmarks.", + "title": "Fair Mixup: Fairness via Interpolation", + "authors": [ + "Ching-Yao Chuang", + "Youssef Mroueh" + ], + "emails": [ + "~Youssef_Mroueh1", + "~Ching-Yao_Chuang1" + ], + "rank": 662 + }, + { + "url": "https://openreview.net/forum?id=tlV90jvZbw", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 6, + 4, + 7 + ], + "rating": "6.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Over-parameterized models, such as large deep networks, often exhibit a double descent phenomenon, whereas a function of model size, error first decreases, increases, and decreases at last. 
This intriguing double descent behavior also occurs as a function of training epochs and has been conjectured to arise because training epochs control the model complexity. In this paper, we show that such epoch-wise double descent occurs for a different reason: It is caused by a superposition of two or more bias-variance tradeoffs that arise because different parts of the network are learned at different epochs, and mitigating this by proper scaling of stepsizes can significantly improve the early stopping performance. We show this analytically for i) linear regression, where differently scaled features give rise to a superposition of bias-variance tradeoffs, and for ii) a wide two-layer neural network, where the first and second layers govern bias-variance tradeoffs. Inspired by this theory, we study two standard convolutional networks empirically and show that eliminating epoch-wise double descent through adjusting stepsizes of different layers improves the early stopping performance.", + "title": "Early Stopping in Deep Networks: Double Descent and How to Eliminate it", + "authors": [ + "Reinhard Heckel", + "Fatih Furkan Yilmaz" + ], + "emails": [ + "~Reinhard_Heckel1", + "~Fatih_Furkan_Yilmaz1" + ], + "rank": 663 + }, + { + "url": "https://openreview.net/forum?id=45uOPa46Kh", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 8, + 4, + 5 + ], + "rating": "6.25", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "Vision-and-language navigation (VLN) is a task in which an agent is embodied in a realistic 3D environment and follows an instruction to reach the goal node. While most of the previous studies have built and investigated a discriminative approach, we notice that there are in fact two possible approaches to building such a VLN agent: discriminative and generative. In this paper, we design and investigate a generative language-grounded policy which uses a language model to compute the distribution over all possible instructions i.e. 
all possible sequences of vocabulary tokens given action and the transition history. In experiments, we show that the proposed generative approach outperforms the discriminative approach in the Room-2-Room (R2R) and Room-4-Room (R4R) datasets, especially in the unseen environments. We further show that the combination of the generative and discriminative policies achieves close to the state-of-the art results in the R2R dataset, demonstrating that the generative and discriminative policies capture the different aspects of VLN.", + "title": "Generative Language-Grounded Policy in Vision-and-Language Navigation with Bayes' Rule", + "authors": [ + "Shuhei Kurita", + "Kyunghyun Cho" + ], + "emails": [ + "~Kyunghyun_Cho1", + "~Shuhei_Kurita1" + ], + "rank": 664 + }, + { + "url": "https://openreview.net/forum?id=K5a_QFEUzA1", + "decision": "Reject", + "ratings": [ + 6, + 7, + 7, + 5 + ], + "rating": "6.24", + "confidences": [ + 3, + 4, + 5, + 5 + ], + "abstract": "Recent unsupervised machine translation (UMT) systems usually employ three main principles: initialization, language modeling and iterative back-translation, though they may apply them differently. Crucially, iterative back-translation and denoising auto-encoding for language modeling provide data diversity to train the UMT systems. However, these diversification processes may have reached their limit. We introduce a novel component to the standard UMT framework called Cross-model Back-translated Distillation (CBD), that is aimed to induce another level of data diversification that existing principles lack. CBD is applicable to all previous UMT approaches. In our experiments, it boosts the performance of the standard UMT methods by 1.5-2.0 BLEU. In particular, in WMT'14 English-French, WMT'16 German-English and English-Romanian, CBD outperforms cross-lingual masked language model (XLM) by 2.3, 2.2 and 1.6 BLEU, respectively. 
It also yields 1.5-3.3 BLEU improvements in IWSLT English-French and English-German tasks. Through extensive experimental analyses, we show that CBD is effective because it embraces data diversity while other similar variants do not.", + "title": "Cross-model Back-translated Distillation for Unsupervised Machine Translation", + "authors": [ + "Phi Xuan Nguyen", + "Shafiq Joty", + "Kui Wu", + "AiTi Aw" + ], + "emails": [ + "i2r.a-star.edu.sg", + "~Shafiq_Joty1", + "~Phi_Xuan_Nguyen1", + "~AiTi_Aw1" + ], + "rank": 665 + }, + { + "url": "https://openreview.net/forum?id=LkFG3lB13U5", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 6 + ], + "rating": "6.24", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "Federated learning is a distributed machine learning paradigm in which a large number of clients coordinate with a central server to learn a model without sharing their own training data. Standard federated optimization methods such as Federated Averaging (FedAvg) are often difficult to tune and exhibit unfavorable convergence behavior. In non-federated settings, adaptive optimization methods have had notable success in combating such issues. In this work, we propose federated versions of adaptive optimizers, including Adagrad, Adam, and Yogi, and analyze their convergence in the presence of heterogeneous data for general non-convex settings. Our results highlight the interplay between client heterogeneity and communication efficiency. We also perform extensive experiments on these methods and show that the use of adaptive optimizers can significantly improve the performance of federated learning.", + "title": "Adaptive Federated Optimization", + "authors": [ + "Sashank J. 
Reddi", + "Zachary Charles", + "Manzil Zaheer", + "Zachary Garrett", + "Keith Rush", + "Jakub Kone\u010dn\u00fd", + "Sanjiv Kumar", + "Hugh Brendan McMahan" + ], + "emails": [ + "~Manzil_Zaheer1", + "~Hugh_Brendan_McMahan1", + "~Zachary_Charles1", + "~Jakub_Kone\u010dn\u00fd1", + "google.com", + "~Sanjiv_Kumar1", + "~Sashank_J._Reddi1" + ], + "rank": 666 + }, + { + "url": "https://openreview.net/forum?id=XOjv2HxIF6i", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 6 + ], + "rating": "6.24", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Several recently proposed unsupervised meta-learning approaches rely on synthetic meta-tasks created using techniques such as random selection, clustering and/or augmentation. In this work, we describe a novel approach that generates meta-tasks using generative models. The proposed family of algorithms generate pairs of in-class and out-of-class samples from the latent space in a principled way, allowing us to create synthetic classes forming the training and validation data of a meta-task. We find that the proposed approach, LAtent Space Interpolation Unsupervised Meta-learning (LASIUM), outperforms or is competitive with current unsupervised learning baselines on few-shot classification tasks on the most widely used benchmark datasets. 
", + "title": "Unsupervised Meta-Learning through Latent-Space Interpolation in Generative Models", + "authors": [ + "Siavash Khodadadeh", + "Sharare Zehtabian", + "Saeed Vahidian", + "Weijia Wang", + "Bill Lin", + "Ladislau Boloni" + ], + "emails": [ + "eng.ucsd.edu", + "~Bill_Lin1", + "~Siavash_Khodadadeh1", + "knights.ucf.edu", + "~Saeed_Vahidian1", + "~Ladislau_Boloni1" + ], + "rank": 667 + }, + { + "url": "https://openreview.net/forum?id=fSTD6NFIW_b", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 8, + 6 + ], + "rating": "6.23", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "Empirical studies suggest that machine learning models often rely on features, such as the background, that may be spuriously correlated with the label only during training time, resulting in poor accuracy during test-time. In this work, we identify the fundamental factors that give rise to this behavior, by explaining why models fail this way even in easy-to-learn tasks where one would expect these models to succeed. In particular, through a theoretical study of gradient-descent-trained linear classifiers on some easy-to-learn tasks, we uncover two complementary failure modes. These modes arise from how spurious correlations induce two kinds of skews in the data: one geometric in nature and another, statistical. Finally, we construct natural modifications of image classification datasets to understand when these failure modes can arise in practice. 
We also design experiments to isolate the two failure modes when training modern neural networks on these datasets.", + "title": "Understanding the failure modes of out-of-distribution generalization", + "authors": [ + "Vaishnavh Nagarajan", + "Anders Andreassen", + "Behnam Neyshabur" + ], + "emails": [ + "google.com", + "~Vaishnavh_Nagarajan3", + "~Behnam_Neyshabur1" + ], + "rank": 668 + }, + { + "url": "https://openreview.net/forum?id=ufZN2-aehFa", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 6 + ], + "rating": "6.23", + "confidences": [ + 1, + 4, + 3, + 5 + ], + "abstract": "Formulating scalable probabilistic regression models with reliable uncertainty estimates has been a long-standing challenge in machine learning research. Recently, casting probabilistic regression as a multi-task learning problem in terms of conditional latent variable (CLV) models such as the Neural Process (NP) has shown promising results. In this paper, we focus on context aggregation, a central component of such architectures, which fuses information from multiple context data points. So far, this aggregation operation has been treated separately from the inference of a latent representation of the target function in CLV models. Our key contribution is to combine these steps into one holistic mechanism by phrasing context aggregation as a Bayesian inference problem. The resulting Bayesian Aggregation (BA) mechanism enables principled handling of task ambiguity, which is key for efficiently processing context information. 
We demonstrate on a range of challenging experiments that BA consistently improves upon the performance of traditional mean aggregation while remaining computationally efficient and fully compatible with existing NP-based models.", + "title": "Bayesian Context Aggregation for Neural Processes", + "authors": [ + "Michael Volpp", + "Fabian Fl\u00fcrenbrock", + "Lukas Grossberger", + "Christian Daniel", + "Gerhard Neumann" + ], + "emails": [ + "~Fabian_Fl\u00fcrenbrock1", + "~Christian_Daniel1", + "~Michael_Volpp1", + "~Gerhard_Neumann1", + "~Lukas_Grossberger1" + ], + "rank": 669 + }, + { + "url": "https://openreview.net/forum?id=NfZ6g2OmXEk", + "decision": "Reject", + "ratings": [ + 7, + 5, + 7, + 6 + ], + "rating": "6.23", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Simulated environments with procedurally generated content have become popular benchmarks for testing systematic generalization of reinforcement learning agents. Every level in such an environment is algorithmically created, thereby exhibiting a unique configuration of underlying factors of variation, such as layout, positions of entities, asset appearances, or even the rules governing environment transitions. Fixed sets of training levels can be determined to aid comparison and reproducibility, and test levels can be held out to evaluate the generalization and robustness of agents. While prior work samples training levels in a direct way (e.g.~uniformly) for the agent to learn from, we investigate the hypothesis that different levels provide different learning progress for an agent at specific times during training. We introduce Prioritized Level Replay, a general framework for estimating the future learning potential of a level given the current state of the agent's policy. 
We find that temporal-difference (TD) errors, while previously used to selectively sample past transitions, also prove effective for scoring a level's future learning potential when the agent replays (that is, revisits) that level to generate entirely new episodes of experiences from it. We report significantly improved sample-efficiency and generalization on the majority of Procgen Benchmark environments as well as two challenging MiniGrid environments. Lastly, we present a qualitative analysis showing that Prioritized Level Replay induces an implicit curriculum, taking the agent gradually from easier to harder levels.", + "title": "Prioritized Level Replay", + "authors": [ + "Minqi Jiang", + "Edward Grefenstette", + "Tim Rockt\u00e4schel" + ], + "emails": [ + "~Tim_Rockt\u00e4schel1", + "~Minqi_Jiang1", + "~Edward_Grefenstette1" + ], + "rank": 670 + }, + { + "url": "https://openreview.net/forum?id=OOsR8BzCnl5", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 4, + 8 + ], + "rating": "6.23", + "confidences": [ + 3, + 5, + 5 + ], + "abstract": "Multi-view classification (MVC) generally focuses on improving classification accuracy by using information from different views, typically integrating them into a unified comprehensive representation for downstream tasks. However, it is also crucial to dynamically assess the quality of a view for different samples in order to provide reliable uncertainty estimations, which indicate whether predictions can be trusted. To this end, we propose a novel multi-view classification method, termed trusted multi-view classification, which provides a new paradigm for multi-view learning by dynamically integrating different views at an evidence level. The algorithm jointly utilizes multiple views to promote both classification reliability (uncertainty estimation during testing) and robustness (out-of-distribution-awareness during training) by integrating evidence from each view. 
To achieve this, the Dirichlet distribution is used to model the distribution of the class probabilities, parameterized with evidence from different views and integrated with the Dempster-Shafer theory. The unified learning framework induces accurate uncertainty and accordingly endows the model with both reliability and robustness for out-of-distribution samples. Extensive experimental results validate the effectiveness of the proposed model in accuracy, reliability and robustness.", + "title": "Trusted Multi-View Classification", + "authors": [ + "Zongbo Han", + "Changqing Zhang", + "Huazhu Fu", + "Joey Tianyi Zhou" + ], + "emails": [ + "~Zongbo_Han1", + "~Joey_Tianyi_Zhou1", + "~Huazhu_Fu4", + "~Changqing_Zhang1" + ], + "rank": 671 + }, + { + "url": "https://openreview.net/forum?id=USCNapootw", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6 + ], + "rating": "6.23", + "confidences": [ + 5, + 3, + 5 + ], + "abstract": "A core challenge with existing certified defense mechanisms is that while they improve certified robustness, they also tend to drastically decrease natural accuracy, making it difficult to use these methods in practice. In this work, we propose a new architecture which addresses this challenge and enables one to boost the certified robustness of any state-of-the-art deep network, while controlling the overall accuracy loss, without requiring retraining. The key idea is to combine this model with a (smaller) certified network where at inference time, an adaptive selection mechanism decides on the network to process the input sample. The approach is compositional: one can combine any pair of state-of-the-art (e.g., EfficientNet or ResNet) and certified networks, without restriction. The resulting architecture enables much higher natural accuracy than previously possible with certified defenses alone, while substantially boosting the certified robustness of deep networks. 
We demonstrate the effectiveness of this adaptive approach on a variety of datasets and architectures. For instance, on CIFAR-10 with an \u2113\u221e perturbation of 2/255, we are the first to obtain a high natural accuracy (90.1%) with non-trivial certified robustness (27.5%). Notably, prior state-of-the-art methods incur a substantial drop in accuracy for a similar certified robustness.", + "title": "Certify or Predict: Boosting Certified Robustness with Compositional Architectures", + "authors": [ + "Mark Niklas Mueller", + "Mislav Balunovic", + "Martin Vechev" + ], + "emails": [ + "~Mark_Niklas_Mueller2", + "~Martin_Vechev1", + "~Mislav_Balunovic1" + ], + "rank": 672 + }, + { + "url": "https://openreview.net/forum?id=ehJqJQk9cw", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 7 + ], + "rating": "6.23", + "confidences": [ + 4, + 3, + 3, + 3 + ], + "abstract": "While federated learning traditionally aims to train a single global model across decentralized local datasets, one model may not always be ideal for all participating clients. Here we propose an alternative, where each client only federates with other relevant clients to obtain a stronger model per client-specific objectives. To achieve this personalization, rather than computing a single model average with constant weights for the entire federation as in traditional FL, we efficiently calculate optimal weighted model combinations for each client, based on figuring out how much a client can benefit from another's model. We do not assume knowledge of any underlying data distributions or client similarities, and allow each client to optimize for arbitrary target distributions of interest, enabling greater flexibility for personalization. We evaluate and characterize our method on a variety of federated settings, datasets, and degrees of local data heterogeneity. 
Our method outperforms existing alternatives, while also enabling new features for personalized FL such as transfer outside of local data distributions.", + "title": "Personalized Federated Learning with First Order Model Optimization", + "authors": [ + "Michael Zhang", + "Karan Sapra", + "Sanja Fidler", + "Serena Yeung", + "Jose M. Alvarez" + ], + "emails": [ + "~Michael_Zhang4", + "~Serena_Yeung1", + "~Sanja_Fidler1", + "~Jose_M._Alvarez2", + "~Karan_Sapra2" + ], + "rank": 673 + }, + { + "url": "https://openreview.net/forum?id=B5VvQrI49Pa", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 6 + ], + "rating": "6.23", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Predicting the behaviors of Hamiltonian systems has been drawing increasing attention in scientific machine learning. However, the vast majority of the literature was focused on predicting separable Hamiltonian systems with their kinematic and potential energy terms being explicitly decoupled, while building data-driven paradigms to predict nonseparable Hamiltonian systems that are ubiquitous in fluid dynamics and quantum mechanics were rarely explored. The main computational challenge lies in the effective embedding of symplectic priors to describe the inherently coupled evolution of position and momentum, which typically exhibits intricate dynamics. To solve the problem, we propose a novel neural network architecture, Nonseparable Symplectic Neural Networks (NSSNNs), to uncover and embed the symplectic structure of a nonseparable Hamiltonian system from limited observation data. The enabling mechanics of our approach is an augmented symplectic time integrator to decouple the position and momentum energy terms and facilitate their evolution. We demonstrated the efficacy and versatility of our method by predicting a wide range of Hamiltonian systems, both separable and nonseparable, including chaotic vortical flows. 
We showed the unique computational merits of our approach to yield long-term, accurate, and robust predictions for large-scale Hamiltonian systems by rigorously enforcing symplectomorphism.", + "title": "Nonseparable Symplectic Neural Networks", + "authors": [ + "Shiying Xiong", + "Yunjin Tong", + "Xingzhe He", + "Shuqi Yang", + "Cheng Yang", + "Bo Zhu" + ], + "emails": [ + "dartmouth.edu", + "~Bo_Zhu2", + "~Shiying_Xiong1", + "bytedance.com", + "~Xingzhe_He1", + "~Shuqi_Yang2" + ], + "rank": 674 + }, + { + "url": "https://openreview.net/forum?id=crAi7c41xTh", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 7 + ], + "rating": "6.23", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "The noise in stochastic gradient descent (SGD) provides a crucial implicit regularization effect for training overparameterized models. Prior theoretical work largely focuses on spherical Gaussian noise, whereas empirical studies demonstrate the phenomenon that parameter-dependent noise --- induced by mini-batches or label perturbation --- is far more effective than Gaussian noise. \nThis paper theoretically characterizes this phenomenon on a quadratically-parameterized model introduced by Vaskevicius et al. and Woodworth et al. We show that in an over-parameterized setting, SGD with label noise recovers the sparse ground-truth with an arbitrary initialization, whereas SGD with Gaussian noise or gradient descent overfits to dense solutions with large norms. Our analysis reveals that parameter-dependent noise introduces a bias towards local minima with smaller noise variance, whereas spherical Gaussian noise does not. ", + "title": "Shape Matters: Understanding the Implicit Bias of the Noise Covariance", + "authors": [ + "Jeff Z. HaoChen", + "Colin Wei", + "Jason D. 
Lee", + "Tengyu Ma" + ], + "emails": [ + "~Tengyu_Ma1", + "~Jason_D._Lee1", + "~Colin_Wei1", + "~Jeff_Z._HaoChen1" + ], + "rank": 675 + }, + { + "url": "https://openreview.net/forum?id=K3qa-sMHpQX", + "decision": "Reject", + "ratings": [ + 7, + 5, + 6, + 7 + ], + "rating": "6.22", + "confidences": [ + 5, + 5, + 4, + 4 + ], + "abstract": "Machine Learning (ML) has a potential to dramatically accelerate large-scale physics-based simulations. However, practical models for real large-scale and complex problems remain out of reach. Here we present ForceNet, a model for accurate and fast quantum chemistry simulations to accelerate catalyst discovery for renewable energy applications. ForceNet is a graph neural network that uses surrounding 3D molecular structure to estimate per-atom forces---a central capability for performing atomic simulations. The key challenge is to accurately capture highly complex and non-linear quantum interactions of atoms in 3D space, on which forces are dependent. To this end, ForceNet adopts (1) expressive message passing architecture, (2) appropriate choice of basis and non-linear activation functions, and (3) model scaling in terms of network depth and width. We show ForceNet reduces the estimation error of atomic forces by 30% compared to existing ML models, and generalizes well to out-of-distribution structures. Finally, we apply ForceNet to the large-scale catalyst dataset, OC20. We use ForceNet to perform quantum chemistry simulations, where ForceNet is able to achieve 4x higher success rate than existing ML models. 
Overall, we demonstrate the potential for ML-based simulations to achieve practical usefulness while being orders of magnitude faster than physics-based simulations.", + "title": "ForceNet: A Graph Neural Network for Large-Scale Quantum Chemistry Simulation", + "authors": [ + "Weihua Hu", + "Muhammed Shuaibi", + "Abhishek Das", + "Siddharth Goyal", + "Anuroop Sriram", + "Jure Leskovec", + "Devi Parikh", + "Larry Zitnick" + ], + "emails": [ + "~Weihua_Hu1", + "~Abhishek_Das1", + "~Devi_Parikh1", + "~Muhammed_Shuaibi1", + "~Jure_Leskovec1", + "~Larry_Zitnick1", + "~Siddharth_Goyal2", + "~Anuroop_Sriram1" + ], + "rank": 676 + }, + { + "url": "https://openreview.net/forum?id=73WTGs96kho", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6 + ], + "rating": "6.22", + "confidences": [ + 3, + 2, + 4 + ], + "abstract": "A challenging open question in deep learning is how to handle tabular data. Unlike domains such as image and natural language processing, where deep architectures prevail, there is still no widely accepted neural architecture that dominates tabular data. As a step toward bridging this gap, we present Net-DNF a novel generic architecture whose inductive bias elicits models whose structure corresponds to logical Boolean formulas in disjunctive normal form (DNF) over affine soft-threshold decision terms. Net-DNFs also promote localized decisions that are taken over small subsets of the features. We present an extensive experiments showing that Net-DNFs significantly and consistently outperform fully connected networks over tabular data. With relatively few hyperparameters, Net-DNFs open the door to practical end-to-end handling of tabular data using neural networks. We present ablation studies, which justify the design choices of Net-DNF including the inductive bias elements, namely, Boolean formulation, locality, and feature selection. 
\n", + "title": "Net-DNF: Effective Deep Modeling of Tabular Data", + "authors": [ + "Liran Katzir", + "Gal Elidan", + "Ran El-Yaniv" + ], + "emails": [ + "~Liran_Katzir1", + "~Gal_Elidan1", + "~Ran_El-Yaniv1" + ], + "rank": 677 + }, + { + "url": "https://openreview.net/forum?id=kHSu4ebxFXY", + "decision": "Accept (Spotlight)", + "ratings": [ + 8, + 6, + 7, + 4 + ], + "rating": "6.22", + "confidences": [ + 5, + 4, + 4, + 5 + ], + "abstract": "Searching for novel molecules with desired chemical properties is crucial in drug discovery. Existing work focuses on developing neural models to generate either molecular sequences or chemical graphs. However, it remains a big challenge to find novel and diverse compounds satisfying several properties. In this paper, we propose MARS, a method for multi-objective drug molecule discovery. MARS is based on the idea of generating the chemical candidates by iteratively editing fragments of molecular graphs. To search for high-quality candidates, it employs Markov chain Monte Carlo sampling (MCMC) on molecules with an annealing scheme and an adaptive proposal. To further improve sample efficiency, MARS uses a graph neural network (GNN) to represent and select candidate edits, where the GNN is trained on-the-fly with samples from MCMC. Experiments show that MARS achieves state-of-the-art performance in various multi-objective settings where molecular bio-activity, drug-likeness, and synthesizability are considered. Remarkably, in the most challenging setting where all four objectives are simultaneously optimized, our approach outperforms previous methods significantly in comprehensive evaluations. 
The code is available at https://github.com/yutxie/mars.", + "title": "MARS: Markov Molecular Sampling for Multi-objective Drug Discovery", + "authors": [ + "Yutong Xie", + "Chence Shi", + "Hao Zhou", + "Yuwei Yang", + "Weinan Zhang", + "Yong Yu", + "Lei Li" + ], + "emails": [ + "~Yutong_Xie3", + "~Chence_Shi1", + "~Yong_Yu1", + "bytedance.com", + "~Weinan_Zhang1", + "~Lei_Li11" + ], + "rank": 678 + }, + { + "url": "https://openreview.net/forum?id=v5gjXpmR8J", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 7 + ], + "rating": "6.22", + "confidences": [ + 5, + 5, + 4, + 4 + ], + "abstract": "We ask the following question: what training information is required to design an effective outlier/out-of-distribution (OOD) detector, i.e., detecting samples that lie far away from training distribution? Since unlabeled data is easily accessible for many applications, the most compelling approach is to develop detectors based on only unlabeled in-distribution data. However, we observe that most existing detectors based on unlabeled data perform poorly, often equivalent to a random prediction. In contrast, existing state-of-the-art OOD detectors achieve impressive performance but require access to fine-grained data labels for supervised training. We propose SSD, an outlier detector based on only unlabeled in-distribution data. We use self-supervised representation learning followed by a Mahalanobis distance based detection in the feature space. We demonstrate that SSD outperforms most existing detectors based on unlabeled data by a large margin. Additionally, SSD even achieves performance on par, and sometimes even better, with supervised training based detectors. Finally, we expand our detection framework with two key extensions. First, we formulate few-shot OOD detection, in which the detector has access to only one to five samples from each class of the targeted OOD dataset. Second, we extend our framework to incorporate training data labels, if available. 
We find that our novel detection framework based on SSD displays enhanced performance with these extensions, and achieves state-of-the-art performance. Our code is publicly available at https://github.com/inspire-group/SSD.", + "title": "SSD: A Unified Framework for Self-Supervised Outlier Detection", + "authors": [ + "Vikash Sehwag", + "Mung Chiang", + "Prateek Mittal" + ], + "emails": [ + "~Mung_Chiang2", + "~Prateek_Mittal1", + "~Vikash_Sehwag1" + ], + "rank": 679 + }, + { + "url": "https://openreview.net/forum?id=BVSM0x3EDK6", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 6 + ], + "rating": "6.21", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "While successful for various computer vision tasks, deep neural networks have shown to be vulnerable to texture style shifts and small perturbations to which humans are robust. In this work, we show that the robustness of neural networks can be greatly improved through the use of random convolutions as data augmentation. Random convolutions are approximately shape-preserving and may distort local textures. Intuitively, randomized convolutions create an infinite number of new domains with similar global shapes but random local texture. Therefore, we explore using outputs of multi-scale random convolutions as new images or mixing them with the original images during training. When applying a network trained with our approach to unseen domains, our method consistently improves the performance on domain generalization benchmarks and is scalable to ImageNet. In particular, in the challenging scenario of generalizing to the sketch domain in PACS and to ImageNet-Sketch, our method outperforms state-of-art methods by a large margin. 
More interestingly, our method can benefit downstream tasks by providing a more robust pretrained visual representation.", + "title": "Robust and Generalizable Visual Representation Learning via Random Convolutions", + "authors": [ + "Zhenlin Xu", + "Deyi Liu", + "Junlin Yang", + "Colin Raffel", + "Marc Niethammer" + ], + "emails": [ + "~Junlin_Yang1", + "~Colin_Raffel1", + "~Marc_Niethammer1", + "~Zhenlin_Xu1", + "~Deyi_Liu1" + ], + "rank": 680 + }, + { + "url": "https://openreview.net/forum?id=kEnBH98BGs5", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 6 + ], + "rating": "6.21", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "We define a notion of information that an individual sample provides to the training of a neural network, and we specialize it to measure both how much a sample informs the final weights and how much it informs the function computed by the weights. Though related, we show that these quantities have a qualitatively different behavior. We give efficient approximations of these quantities using a linearized network and demonstrate empirically that the approximation is accurate for real-world architectures, such as pre-trained ResNets. We apply these measures to several problems, such as dataset summarization, analysis of under-sampled classes, comparison of informativeness of different data sources, and detection of adversarial and corrupted examples. 
Our work generalizes existing frameworks, but enjoys better computational properties for heavily over-parametrized models, which makes it possible to apply it to real-world networks.", + "title": "Estimating informativeness of samples with Smooth Unique Information", + "authors": [ + "Hrayr Harutyunyan", + "Alessandro Achille", + "Giovanni Paolini", + "Orchid Majumder", + "Avinash Ravichandran", + "Rahul Bhotika", + "Stefano Soatto" + ], + "emails": [ + "~Avinash_Ravichandran1", + "~Hrayr_Harutyunyan1", + "~Rahul_Bhotika1", + "~Orchid_Majumder1", + "~Alessandro_Achille1", + "~Giovanni_Paolini1", + "~Stefano_Soatto1" + ], + "rank": 681 + }, + { + "url": "https://openreview.net/forum?id=OQ08SN70M1V", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 6 + ], + "rating": "6.21", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Although widely adopted, existing approaches for fine-tuning pre-trained language models have been shown to be unstable across hyper-parameter settings, motivating recent work on trust region methods. In this paper, we present a simplified and efficient method rooted in trust region theory that replaces previously used adversarial objectives with parametric noise (sampling from either a normal or uniform distribution), thereby discouraging representation change during fine-tuning when possible without hurting performance. We also introduce a new analysis to motivate the use of trust region methods more generally, by studying representational collapse; the degradation of generalizable representations from pre-trained models as they are fine-tuned for a specific end task. Extensive experiments show that our fine-tuning method matches or exceeds the performance of previous trust region methods on a range of understanding and generation tasks (including DailyMail/CNN, Gigaword, Reddit TIFU, and the GLUE benchmark), while also being much faster. 
We also show that it is less prone to representation collapse; the pre-trained models maintain more generalizable representations every time they are fine-tuned.", + "title": "Better Fine-Tuning by Reducing Representational Collapse", + "authors": [ + "Armen Aghajanyan", + "Akshat Shrivastava", + "Anchit Gupta", + "Naman Goyal", + "Luke Zettlemoyer", + "Sonal Gupta" + ], + "emails": [ + "~Naman_Goyal1", + "~Armen_Aghajanyan1", + "fb.com", + "~Luke_Zettlemoyer1" + ], + "rank": 682 + }, + { + "url": "https://openreview.net/forum?id=SK7A5pdrgov", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 8, + 4, + 6 + ], + "rating": "6.21", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Despite recent successes of reinforcement learning (RL), it remains a challenge for agents to transfer learned skills to related environments. To facilitate research addressing this problem, we proposeCausalWorld, a benchmark for causal structure and transfer learning in a robotic manipulation environment. The environment is a simulation of an open-source robotic platform, hence offering the possibility of sim-to-real transfer. Tasks consist of constructing 3D shapes from a set of blocks - inspired by how children learn to build complex structures. The key strength of CausalWorld is that it provides a combinatorial family of such tasks with common causal structure and underlying factors (including, e.g., robot and object masses, colors, sizes). The user (or the agent) may intervene on all causal variables, which allows for fine-grained control over how similar different tasks (or task distributions) are. One can thus easily define training and evaluation distributions of a desired difficulty level, targeting a specific form of generalization (e.g., only changes in appearance or object mass). Further, this common parametrization facilitates defining curricula by interpolating between an initial and a target task. 
While users may define their own task distributions, we present eight meaningful distributions as concrete benchmarks, ranging from simple to very challenging, all of which require long-horizon planning as well as precise low-level motor control. Finally, we provide baseline results for a subset of these tasks on distinct training curricula and corresponding evaluation protocols, verifying the feasibility of the tasks in this benchmark.", + "title": "CausalWorld: A Robotic Manipulation Benchmark for Causal Structure and Transfer Learning", + "authors": [ + "Ossama Ahmed", + "Frederik Tr\u00e4uble", + "Anirudh Goyal", + "Alexander Neitz", + "Manuel Wuthrich", + "Yoshua Bengio", + "Bernhard Sch\u00f6lkopf", + "Stefan Bauer" + ], + "emails": [ + "~Manuel_Wuthrich1", + "~Frederik_Tr\u00e4uble1", + "~Bernhard_Sch\u00f6lkopf1", + "~Stefan_Bauer1", + "~Yoshua_Bengio1", + "~Anirudh_Goyal1", + "~Alexander_Neitz1", + "~Ossama_Ahmed1" + ], + "rank": 683 + }, + { + "url": "https://openreview.net/forum?id=TSRTzJnuEBS", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 7 + ], + "rating": "6.21", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Autoregressive models are widely used for tasks such as image and audio generation. The sampling process of these models, however, does not allow interruptions and cannot adapt to real-time computational resources. This challenge impedes the deployment of powerful autoregressive models, which involve a slow sampling process that is sequential in nature and typically scales linearly with respect to the data dimension. To address this difficulty, we propose a new family of autoregressive models that enables anytime sampling. Inspired by Principal Component Analysis, we learn a structured representation space where dimensions are ordered based on their importance with respect to reconstruction. 
Using an autoregressive model in this latent space, we trade off sample quality for computational efficiency by truncating the generation process before decoding into the original data space. Experimentally, we demonstrate in several image and audio generation tasks that sample quality degrades gracefully as we reduce the computational budget for sampling. The approach suffers almost no loss in sample quality (measured by FID) using only 60\\% to 80\\% of all latent dimensions for image data. Code is available at https://github.com/Newbeeer/Anytime-Auto-Regressive-Model.", + "title": "Anytime Sampling for Autoregressive Models via Ordered Autoencoding", + "authors": [ + "Yilun Xu", + "Yang Song", + "Sahaj Garg", + "Linyuan Gong", + "Rui Shu", + "Aditya Grover", + "Stefano Ermon" + ], + "emails": [ + "~Yang_Song1", + "~Yilun_Xu1", + "hotmail.com", + "~Aditya_Grover1", + "~Sahaj_Garg1", + "~Rui_Shu1", + "~Stefano_Ermon1" + ], + "rank": 684 + }, + { + "url": "https://openreview.net/forum?id=-IXhmY16R3M", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 6 + ], + "rating": "6.21", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "In this paper, we explain the universal approximation capabilities of deep residual neural networks through geometric nonlinear control. Inspired by recent work establishing links between residual networks and control systems, we provide a general sufficient condition for a residual network to have the power of universal approximation by asking the activation function, or one of its derivatives, to satisfy a quadratic differential equation. Many activation functions used in practice satisfy this assumption, exactly or approximately, and we show this property to be sufficient for an adequately deep neural network with n+1 neurons per\nlayer to approximate arbitrarily well, on a compact set and with respect to the supremum norm, any continuous function from Rn to Rn. 
We further show this result to hold for very simple architectures for which the weights only need to assume two values. The first key technical contribution consists of relating the universal approximation problem to controllability of an ensemble of control systems corresponding to a residual network and to leverage classical Lie algebraic techniques to characterize controllability. The second technical contribution is to identify monotonicity as the bridge between controllability of finite ensembles and uniform approximability on compact sets.", + "title": "Universal approximation power of deep residual neural networks via nonlinear control theory", + "authors": [ + "Paulo Tabuada", + "Bahman Gharesifard" + ], + "emails": [ + "~Paulo_Tabuada1", + "queensu.ca" + ], + "rank": 685 + }, + { + "url": "https://openreview.net/forum?id=w_7JMpGZRh0", + "decision": "Accept (Spotlight)", + "ratings": [ + 6, + 6, + 7, + 6 + ], + "rating": "6.20", + "confidences": [ + 5, + 3, + 3, + 4 + ], + "abstract": "In this paper, we introduce Watch-And-Help (WAH), a challenge for testing social intelligence in agents. In WAH, an AI agent needs to help a human-like agent perform a complex household task efficiently. To succeed, the AI agent needs to i) understand the underlying goal of the task by watching a single demonstration of the human-like agent performing the same task (social perception), and ii) coordinate with the human-like agent to solve the task in an unseen environment as fast as possible (human-AI collaboration). For this challenge, we build VirtualHome-Social, a multi-agent household environment, and provide a benchmark including both planning and learning based baselines. We evaluate the performance of AI agents with the human-like agent as well as and with real humans using objective metrics and subjective user ratings. 
Experimental results demonstrate that our challenge and virtual environment enable a systematic evaluation on the important aspects of machine social intelligence at scale.", + "title": "Watch-And-Help: A Challenge for Social Perception and Human-AI Collaboration", + "authors": [ + "Xavier Puig", + "Tianmin Shu", + "Shuang Li", + "Zilin Wang", + "Yuan-Hong Liao", + "Joshua B. Tenenbaum", + "Sanja Fidler", + "Antonio Torralba" + ], + "emails": [ + "~Tianmin_Shu1", + "~Antonio_Torralba1", + "~Shuang_Li5", + "~Joshua_B._Tenenbaum1", + "~Xavier_Puig1", + "~Sanja_Fidler1", + "~Zilin_Wang2", + "~Yuan-Hong_Liao2" + ], + "rank": 686 + }, + { + "url": "https://openreview.net/forum?id=QoWatN-b8T", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 7 + ], + "rating": "6.20", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Episodic and semantic memory are critical components of the human memory model. The theory of complementary learning systems (McClelland et al., 1995) suggests that the compressed representation produced by a serial event (episodic memory) is later restructured to build a more generalized form of reusable knowledge (semantic memory). In this work, we develop a new principled Bayesian memory allocation scheme that bridges the gap between episodic and semantic memory via a hierarchical latent variable model. We take inspiration from traditional heap allocation and extend the idea of locally contiguous memory to the Kanerva Machine, enabling a novel differentiable block allocated latent memory. In contrast to the Kanerva Machine, we simplify the process of memory writing by treating it as a fully feed forward deterministic process, relying on the stochasticity of the read key distribution to disperse information within the memory. 
We demonstrate that this allocation scheme improves performance in memory conditional image generation, resulting in new state-of-the-art conditional likelihood values on binarized MNIST (\u226441.58 nats/image) , binarized Omniglot (\u226466.24 nats/image), as well as presenting competitive performance on CIFAR10, DMLab Mazes, Celeb-A and ImageNet32\u00d732.", + "title": "Kanerva++: Extending the Kanerva Machine With Differentiable, Locally Block Allocated Latent Memory", + "authors": [ + "Jason Ramapuram", + "Yan Wu", + "Alexandros Kalousis" + ], + "emails": [ + "~Jason_Ramapuram1", + "~Yan_Wu1", + "~Alexandros_Kalousis1" + ], + "rank": 687 + }, + { + "url": "https://openreview.net/forum?id=xHKVVHGDOEk", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 6 + ], + "rating": "6.20", + "confidences": [ + 3, + 5, + 4, + 3 + ], + "abstract": "Influence functions approximate the effect of training samples in test-time predictions and have a wide variety of applications in machine learning interpretability and uncertainty estimation. A commonly-used (first-order) influence function can be implemented efficiently as a post-hoc method requiring access only to the gradients and Hessian of the model. For linear models, influence functions are well-defined due to the convexity of the underlying loss function and are generally accurate even across difficult settings where model changes are fairly large such as estimating group influences. Influence functions, however, are not well-understood in the context of deep learning with non-convex loss functions. In this paper, we provide a comprehensive and large-scale empirical study of successes and failures of influence functions in neural network models trained on datasets such as Iris, MNIST, CIFAR-10 and ImageNet. 
Through our extensive experiments, we show that the network architecture, its depth and width, as well as the extent of model parameterization and regularization techniques have strong effects in the accuracy of influence functions. In particular, we find that (i) influence estimates are fairly accurate for shallow networks, while for deeper networks the estimates are often erroneous; (ii) for certain network architectures and datasets, training with weight-decay regularization is important to get high-quality influence estimates; and (iii) the accuracy of influence estimates can vary significantly depending on the examined test points. These results suggest that in general influence functions in deep learning are fragile and call for developing improved influence estimation methods to mitigate these issues in non-convex setups.", + "title": "Influence Functions in Deep Learning Are Fragile", + "authors": [ + "Samyadeep Basu", + "Phil Pope", + "Soheil Feizi" + ], + "emails": [ + "~Samyadeep_Basu1", + "~Soheil_Feizi2", + "~Phil_Pope1" + ], + "rank": 688 + }, + { + "url": "https://openreview.net/forum?id=Qk-Wq5AIjpq", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6 + ], + "rating": "6.20", + "confidences": [ + 4, + 2, + 4 + ], + "abstract": "A key challenge for deploying deep neural networks (DNNs) in safety critical settings is the need to provide rigorous ways to quantify their uncertainty. In this paper, we propose a novel algorithm for constructing predicted classification confidences for DNNs that comes with provable correctness guarantees. Our approach uses Clopper-Pearson confidence intervals for the Binomial distribution in conjunction with the histogram binning approach to calibrated prediction. 
In addition, we demonstrate how our predicted confidences can be used to enable downstream guarantees in two settings: (i) fast DNN inference, where we demonstrate how to compose a fast but inaccurate DNN with an accurate but slow DNN in a rigorous way to improve performance without sacrificing accuracy, and (ii) safe planning, where we guarantee safety when using a DNN to predict whether a given action is safe based on visual observations. In our experiments, we demonstrate that our approach can be used to provide guarantees for state-of-the-art DNNs.", + "title": "PAC Confidence Predictions for Deep Neural Network Classifiers", + "authors": [ + "Sangdon Park", + "Shuo Li", + "Insup Lee", + "Osbert Bastani" + ], + "emails": [ + "~Osbert_Bastani1", + "seas.upenn.edu", + "~Sangdon_Park1", + "~Insup_Lee1" + ], + "rank": 689 + }, + { + "url": "https://openreview.net/forum?id=qYZD-AO1Vn", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 7 + ], + "rating": "6.20", + "confidences": [ + 4, + 3, + 5, + 3 + ], + "abstract": "Trust region methods are a popular tool in reinforcement learning as they yield robust policy updates in continuous and discrete action spaces. However, enforcing such trust regions in deep reinforcement learning is difficult. Hence, many approaches, such as Trust Region Policy Optimization (TRPO) and Proximal Policy Optimization (PPO), are based on approximations. Due to those approximations, they violate the constraints or fail to find the optimal solution within the trust region. Moreover, they are difficult to implement, often lack sufficient exploration, and have been shown to depend on seemingly unrelated implementation choices. In this work, we propose differentiable neural network layers to enforce trust regions for deep Gaussian policies via closed-form projections. Unlike existing methods, those layers formalize trust regions for each state individually and can complement existing reinforcement learning algorithms. 
We derive trust region projections based on the Kullback-Leibler divergence, the Wasserstein L2 distance, and the Frobenius norm for Gaussian distributions. We empirically demonstrate that those projection layers achieve similar or better results than existing methods while being almost agnostic to specific implementation choices. The code is available at https://git.io/Jthb0.\n", + "title": "Differentiable Trust Region Layers for Deep Reinforcement Learning", + "authors": [ + "Fabian Otto", + "Philipp Becker", + "Vien Anh Ngo", + "Hanna Carolin Maria Ziesche", + "Gerhard Neumann" + ], + "emails": [ + "~Philipp_Becker1", + "~Vien_Anh_Ngo1", + "~Fabian_Otto1", + "~Hanna_Carolin_Maria_Ziesche1", + "~Gerhard_Neumann1" + ], + "rank": 690 + }, + { + "url": "https://openreview.net/forum?id=tw60PTRSda2", + "decision": "Reject", + "ratings": [ + 7, + 7, + 6, + 5 + ], + "rating": "6.20", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "In order to interact with objects in our environment, we rely on an understanding of the actions that can be performed on them, and the extent to which they rely or have an effect on the properties of the object. This knowledge is called the object \"affordance\". We propose an approach for creating an embedding of objects in an affordance space, in which each dimension corresponds to an aspect of meaning shared by many actions, using text corpora. This embedding makes it possible to predict which verbs will be applicable to a given object, as captured in human judgments of affordance, better than a variety of alternative approaches. Furthermore, we show that the dimensions learned are interpretable, and that they correspond to typical patterns of interaction with objects. 
Finally, we show that the dimensions can be used to predict a state-of-the-art mental representation of objects, derived purely from human judgements of object similarity.", + "title": "Understanding Mental Representations Of Objects Through Verbs Applied To Them", + "authors": [ + "Ka Chun Lam", + "Francisco Pereira", + "Maryam Vaziri-Pashkam", + "Kristin Woodard", + "Emalie McMahon" + ], + "emails": [ + "~Ka_Chun_Lam1", + "jhu.edu", + "~Maryam_Vaziri-Pashkam1", + "nih.gov", + "~Francisco_Pereira1" + ], + "rank": 691 + }, + { + "url": "https://openreview.net/forum?id=vNw0Gzw8oki", + "decision": "Reject", + "ratings": [ + 8, + 5, + 5, + 7 + ], + "rating": "6.20", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Deep kernel learning is a promising combination of deep neural networks and nonparametric function estimation. However, as a data-driven approach, the performance of deep kernel learning can still be restricted by scarce or insufficient data, especially in extrapolation tasks. To address these limitations, we propose Physics Informed Deep Kernel Learning (PI-DKL) that exploits physics knowledge represented by differential equations with latent sources. Specifically, we use the posterior function sample of the Gaussian process as the surrogate for the solution of the differential equation, and construct a generative component to integrate the equation in a principled Bayesian hybrid framework. For efficient and effective inference, we marginalize out the latent variables in the joint probability and derive a simple model evidence lower bound (ELBO), based on which we develop a stochastic collapsed inference algorithm. Our ELBO can be viewed as a nice, interpretable posterior regularization objective. On synthetic datasets and real-world applications, we show the advantage of our approach in both prediction accuracy and uncertainty quantification. 
", + "title": "Physics Informed Deep Kernel Learning", + "authors": [ + "Zheng Wang", + "Wei Xing", + "Robert Kirby", + "Shandian Zhe" + ], + "emails": [ + "~Robert_Kirby1", + "~Shandian_Zhe1", + "sci.utah.edu", + "~Zheng_Wang2" + ], + "rank": 692 + }, + { + "url": "https://openreview.net/forum?id=IrM64DGB21", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 5, + 7 + ], + "rating": "6.20", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Model-based planning is often thought to be necessary for deep, careful reasoning and generalization in artificial agents. While recent successes of model-based reinforcement learning (MBRL) with deep function approximation have strengthened this hypothesis, the resulting diversity of model-based methods has also made it difficult to track which components drive success and why. In this paper, we seek to disentangle the contributions of recent methods by focusing on three questions: (1) How does planning benefit MBRL agents? (2) Within planning, what choices drive performance? (3) To what extent does planning improve generalization? To answer these questions, we study the performance of MuZero (Schrittwieser et al., 2019), a state-of-the-art MBRL algorithm with strong connections and overlapping components with many other MBRL algorithms. We perform a number of interventions and ablations of MuZero across a wide range of environments, including control tasks, Atari, and 9x9 Go. Our results suggest the following: (1) Planning is most useful in the learning process, both for policy updates and for providing a more useful data distribution. (2) Using shallow trees with simple Monte-Carlo rollouts is as performant as more complex methods, except in the most difficult reasoning tasks. (3) Planning alone is insufficient to drive strong generalization. 
These results indicate where and how to utilize planning in reinforcement learning settings, and highlight a number of open questions for future MBRL research.", + "title": "On the role of planning in model-based deep reinforcement learning", + "authors": [ + "Jessica B Hamrick", + "Abram L. Friesen", + "Feryal Behbahani", + "Arthur Guez", + "Fabio Viola", + "Sims Witherspoon", + "Thomas Anthony", + "Lars Holger Buesing", + "Petar Veli\u010dkovi\u0107", + "Theophane Weber" + ], + "emails": [ + "~Abram_L._Friesen1", + "~Lars_Holger_Buesing1", + "~Petar_Veli\u010dkovi\u01071", + "~Fabio_Viola2", + "~Jessica_B_Hamrick1", + "google.com", + "~Arthur_Guez1", + "~Thomas_Anthony1", + "~Theophane_Weber1", + "~Feryal_Behbahani1" + ], + "rank": 693 + }, + { + "url": "https://openreview.net/forum?id=G70Z8ds32C9", + "decision": "Reject", + "ratings": [ + 4, + 6, + 6, + 9, + 6 + ], + "rating": "6.20", + "confidences": [ + 3, + 4, + 2, + 3, + 3 + ], + "abstract": "This work attempts to interpret modern deep (convolutional) networks from the principles of rate reduction and (shift) invariant classification. We show that the basic iterative gradient ascent scheme for maximizing the rate reduction of learned features naturally leads to a deep network, one iteration per layer. The architectures, operators (linear or nonlinear), and parameters of the network are all explicitly constructed layer-by-layer in a forward propagation fashion. All components of this ``white box'' network have precise optimization, statistical, and geometric interpretation. Our preliminary experiments indicate that such a network can already learn a good discriminative deep representation without any back propagation training. Moreover, all linear operators of the so-derived network naturally become multi-channel convolutions when we enforce classification to be rigorously shift-invariant. 
The derivation also indicates that such a convolutional network is significantly more efficient to learn and construct in the spectral domain.", + "title": "Deep Networks from the Principle of Rate Reduction", + "authors": [ + "Kwan Ho Ryan Chan", + "Yaodong Yu", + "Chong You", + "Haozhi Qi", + "John Wright", + "Yi Ma" + ], + "emails": [ + "~Haozhi_Qi1", + "~John_Wright1", + "~Yaodong_Yu4", + "~Chong_You2", + "berkeley.edu", + "~Yi_Ma4" + ], + "rank": 694 + }, + { + "url": "https://openreview.net/forum?id=HxzSxSxLOJZ", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 7, + 6 + ], + "rating": "6.20", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "A key appeal of the recently proposed Neural Ordinary Differential Equation (ODE) framework is that it seems to provide a continuous-time extension of discrete residual neural networks. \nAs we show herein, though, trained Neural ODE models actually depend on the specific numerical method used during training.\nIf the trained model is supposed to be a flow generated from an ODE, it should be possible to choose another numerical solver with equal or smaller numerical error without loss of performance.\nWe observe that if training relies on a solver with overly coarse discretization, then testing with another solver of equal or smaller numerical error results in a sharp drop in accuracy. \nIn such cases, the combination of vector field and numerical method cannot be interpreted as a flow generated from an ODE, which arguably poses a fatal breakdown of the Neural ODE concept.\nWe observe, however, that there exists a critical step size beyond which the training yields a valid ODE vector field. \nWe propose a method that monitors the behavior of the ODE solver during training to adapt its step size, aiming to ensure a valid ODE without unnecessarily increasing computational cost.\nWe verify this adaption algorithm on a common bench mark dataset as well as a synthetic dataset. 
\n", + "title": "ResNet After All: Neural ODEs and Their Numerical Solution", + "authors": [ + "Katharina Ott", + "Prateek Katiyar", + "Philipp Hennig", + "Michael Tiemann" + ], + "emails": [ + "~Philipp_Hennig1", + "de.bosch.com", + "~Michael_Tiemann1", + "~Katharina_Ott1" + ], + "rank": 695 + }, + { + "url": "https://openreview.net/forum?id=Io8oYQb4LRK", + "decision": "Reject", + "ratings": [ + 6, + 5, + 7, + 7 + ], + "rating": "6.20", + "confidences": [ + 4, + 2, + 2, + 2 + ], + "abstract": "Gradient-based meta-learning has earned a widespread popularity in few-shot learning, but remains broadly impractical for tasks with long horizons (many gradient steps), due to memory scaling and gradient degradation issues. A common workaround is to learn meta-parameters online, but this introduces greediness which comes with a significant performance drop. In this work, we enable non-greedy meta-learning of hyperparameters over long horizons by sharing hyperparameters that are contiguous in time, and using the sign of hypergradients rather than their magnitude to indicate convergence. We implement this with forward-mode differentiation, which we extend to the popular momentum-based SGD optimizer. We demonstrate that the hyperparameters of this optimizer can be learned non-greedily without gradient degradation over \u223c104 inner gradient steps, by only requiring \u223c10 outer gradient steps. On CIFAR-10, we outperform greedy and random search methods for the same computational budget by nearly 10%. 
Code will be available upon publication.", + "title": "Non-greedy Gradient-based Hyperparameter Optimization Over Long Horizons", + "authors": [ + "Paul Micaelli", + "Amos Storkey" + ], + "emails": [ + "~Amos_Storkey1", + "~Paul_Micaelli1" + ], + "rank": 696 + }, + { + "url": "https://openreview.net/forum?id=BXewfAYMmJw", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 7, + 5, + 5 + ], + "rating": "6.20", + "confidences": [ + 4, + 3, + 5, + 3 + ], + "abstract": "Neural networks are prone to learning shortcuts -- they often model simple correlations, ignoring more complex ones that potentially generalize better. Prior works on image classification show that instead of learning a connection to object shape, deep classifiers tend to exploit spurious correlations with low-level texture or the background for solving the classification task. In this work, we take a step towards more robust and interpretable classifiers that explicitly expose the task's causal structure. Building on current advances in deep generative modeling, we propose to decompose the image generation process into independent causal mechanisms that we train without direct supervision. By exploiting appropriate inductive biases, these mechanisms disentangle object shape, object texture, and background; hence, they allow for generating counterfactual images. We demonstrate the ability of our model to generate such images on MNIST and ImageNet. Further, we show that the counterfactual images can improve out-of-distribution robustness with a marginal drop in performance on the original classification task, despite being synthetic. 
Lastly, our generative model can be trained efficiently on a single GPU, exploiting common pre-trained models as inductive biases.", + "title": "Counterfactual Generative Networks", + "authors": [ + "Axel Sauer", + "Andreas Geiger" + ], + "emails": [ + "~Andreas_Geiger3", + "~Axel_Sauer1" + ], + "rank": 697 + }, + { + "url": "https://openreview.net/forum?id=MyHwDabUHZm", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 7, + 4 + ], + "rating": "6.20", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "We find that the way we choose to represent data labels can have a profound effect on the quality of trained models. For example, training an image classifier to regress audio labels rather than traditional categorical probabilities produces a more reliable classification. This result is surprising, considering that audio labels are more complex than simpler numerical probabilities or text. We hypothesize that high dimensional, high entropy label representations are generally more useful because they provide a stronger error signal. We support this hypothesis with evidence from various label representations including constant matrices, spectrograms, shuffled spectrograms, Gaussian mixtures, and uniform random matrices of various dimensionalities. Our experiments reveal that high dimensional, high entropy labels achieve comparable accuracy to text (categorical) labels on standard image classification tasks, but features learned through our label representations exhibit more robustness under various adversarial attacks and better effectiveness with a limited amount of training data. 
These results suggest that label representation may play a more important role than previously thought.", + "title": "Beyond Categorical Label Representations for Image Classification", + "authors": [ + "Boyuan Chen", + "Yu Li", + "Sunand Raghupathi", + "Hod Lipson" + ], + "emails": [ + "~Hod_Lipson1", + "columbia.edu", + "~Boyuan_Chen1" + ], + "rank": 698 + }, + { + "url": "https://openreview.net/forum?id=YHdeAO61l6T", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 6, + 6 + ], + "rating": "6.20", + "confidences": [ + 3, + 4, + 2, + 3, + 3 + ], + "abstract": "Designing an incentive compatible auction that maximizes expected revenue is a central problem in Auction Design. While theoretical approaches to the problem have hit some limits, a recent research direction initiated by Duetting et al. (2019) consists in building neural network architectures to find optimal auctions. We propose two conceptual deviations from their approach which result in enhanced performance. First, we use recent results in theoretical auction design to introduce a time-independent Lagrangian. This not only circumvents the need for an expensive hyper-parameter search (as in prior work), but also provides a single metric to compare the performance of two auctions (absent from prior work). Second, the optimization procedure in previous work uses an inner maximization loop to compute optimal misreports. We amortize this process through the introduction of an additional neural network. We demonstrate the effectiveness of our approach by learning competitive or strictly improved auctions compared to prior work. Both results together further imply a novel formulation of Auction Design as a two-player game with stationary utility functions.", + "title": "Auction Learning as a Two-Player Game", + "authors": [ + "Jad Rahme", + "Samy Jelassi", + "S. 
Matthew Weinberg" + ], + "emails": [ + "~Samy_Jelassi1", + "~Jad_Rahme1", + "~S._Matthew_Weinberg1" + ], + "rank": 699 + }, + { + "url": "https://openreview.net/forum?id=80FMcTSZ6J0", + "decision": "Accept (Spotlight)", + "ratings": [ + 7, + 7, + 5, + 6 + ], + "rating": "6.19", + "confidences": [ + 3, + 4, + 4, + 5 + ], + "abstract": "The noise in stochastic gradient descent (SGD) provides a crucial implicit regularization effect, previously studied in optimization by analyzing the dynamics of parameter updates. In this paper, we are interested in learning with noisy labels, where we have a collection of samples with potential mislabeling. We show that a previously rarely discussed SGD noise, induced by stochastic label noise (SLN), mitigates the effects of inherent label noise. In contrast, the common SGD noise directly applied to model parameters does not. We formalize the differences and connections of SGD noise variants, showing that SLN induces SGD noise dependent on the sharpness of output landscape and the confidence of output probability, which may help escape from sharp minima and prevent overconfidence. SLN not only improves generalization in its simplest form but also boosts popular robust training methods, including sample selection and label correction. Specifically, we present an enhanced algorithm by applying SLN to label correction. 
Our code is released.", + "title": "Noise against noise: stochastic label noise helps combat inherent label noise", + "authors": [ + "Pengfei Chen", + "Guangyong Chen", + "Junjie Ye", + "jingwei zhao", + "Pheng-Ann Heng" + ], + "emails": [ + "siat.ac.cn", + "~jingwei_zhao1", + "gmail.com", + "~Pengfei_Chen1", + "~Pheng-Ann_Heng1" + ], + "rank": 700 + }, + { + "url": "https://openreview.net/forum?id=N33d7wjgzde", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 7, + 6, + 6 + ], + "rating": "6.19", + "confidences": [ + 5, + 4, + 3, + 4, + 5 + ], + "abstract": "Weakly supervised segmentation requires assigning a label to every pixel based on training instances with partial annotations such as image-level tags, object bounding boxes, labeled points and scribbles. This task is challenging, as coarse annotations (tags, boxes) lack precise pixel localization whereas sparse annotations (points, scribbles) lack broad region coverage. Existing methods tackle these two types of weak supervision differently: Class activation maps are used to localize coarse labels and iteratively refine the segmentation model, whereas conditional random fields are used to propagate sparse labels to the entire image.\n\nWe formulate weakly supervised segmentation as a semi-supervised metric learning problem, where pixels of the same (different) semantics need to be mapped to the same (distinctive) features. We propose 4 types of contrastive relationships between pixels and segments in the feature space, capturing low-level image similarity, semantic annotation, co-occurrence, and feature affinity They act as priors; the pixel-wise feature can be learned from training images with any partial annotations in a data-driven fashion. In particular, unlabeled pixels in training images participate not only in data-driven grouping within each image, but also in discriminative feature learning within and across images. 
We deliver a universal weakly supervised segmenter with significant gains on Pascal VOC and DensePose.", + "title": "Universal Weakly Supervised Segmentation by Pixel-to-Segment Contrastive Learning", + "authors": [ + "Tsung-Wei Ke", + "Jyh-Jing Hwang", + "Stella Yu" + ], + "emails": [ + "~Tsung-Wei_Ke2", + "~Jyh-Jing_Hwang1", + "~Stella_Yu2" + ], + "rank": 701 + }, + { + "url": "https://openreview.net/forum?id=KmykpuSrjcq", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 6, + 7 + ], + "rating": "6.19", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "This paper presents Prototypical Contrastive Learning (PCL), an unsupervised representation learning method that bridges contrastive learning with clustering. PCL not only learns low-level features for the task of instance discrimination, but more importantly, it implicitly encodes semantic structures of the data into the learned embedding space. Specifically, we introduce prototypes as latent variables to help find the maximum-likelihood estimation of the network parameters in an Expectation-Maximization framework. We iteratively perform E-step as finding the distribution of prototypes via clustering and M-step as optimizing the network via contrastive learning. We propose ProtoNCE loss, a generalized version of the InfoNCE loss for contrastive learning, which encourages representations to be closer to their assigned prototypes. PCL outperforms state-of-the-art instance-wise contrastive learning methods on multiple benchmarks with substantial improvement in low-resource transfer learning. 
Code and pretrained models are available at https://github.com/salesforce/PCL.", + "title": "Prototypical Contrastive Learning of Unsupervised Representations", + "authors": [ + "Junnan Li", + "Pan Zhou", + "Caiming Xiong", + "Steven Hoi" + ], + "emails": [ + "~Steven_Hoi2", + "~Caiming_Xiong1", + "~Junnan_Li2", + "~Pan_Zhou3" + ], + "rank": 702 + }, + { + "url": "https://openreview.net/forum?id=c1xAGI3nYST", + "decision": "Reject", + "ratings": [ + 6, + 5, + 8, + 5 + ], + "rating": "6.19", + "confidences": [ + 4, + 4, + 5, + 3 + ], + "abstract": "Variational autoencoders (VAEs) are one of the powerful likelihood-based generative models with applications in various domains. However, they struggle to generate high-quality images, especially when samples are obtained from the prior without any tempering. One explanation for VAEs\u2019 poor generative quality is the prior hole problem: the prior distribution fails to match the aggregate approximate posterior. Due to this mismatch, there exist areas in the latent space with high density under the prior that do not correspond to any encoded image. Samples from those areas are decoded to corrupted images. To tackle this issue, we propose an energy-based prior defined by the product of a base prior distribution and a reweighting factor, designed to bring the base closer to the aggregate posterior. We train the reweighting factor by noise contrastive estimation, and we generalize it to hierarchical VAEs with many latent variable groups. 
Our experiments confirm that the proposed noise contrastive priors improve the generative performance of state-of-the-art VAEs by a large margin on the MNIST, CIFAR-10, CelebA 64, and CelebA HQ 256 datasets.", + "title": "NCP-VAE: Variational Autoencoders with Noise Contrastive Priors", + "authors": [ + "Jyoti Aneja", + "Alex Schwing", + "Jan Kautz", + "Arash Vahdat" + ], + "emails": [ + "~Jyoti_Aneja2", + "~Arash_Vahdat3", + "~Alex_Schwing1", + "~Jan_Kautz1" + ], + "rank": 703 + }, + { + "url": "https://openreview.net/forum?id=cO1IH43yUF", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 7 + ], + "rating": "6.19", + "confidences": [ + 5, + 4, + 4, + 3 + ], + "abstract": "This paper is a study of fine-tuning of BERT contextual representations, with focus on commonly observed instabilities in few-sample scenarios. We identify several factors that cause this instability: the common use of a non-standard optimization method with biased gradient estimation; the limited applicability of significant parts of the BERT network for down-stream tasks; and the prevalent practice of using a pre-determined, and small number of training iterations. We empirically test the impact of these factors, and identify alternative practices that resolve the commonly observed instability of the process. In light of these observations, we re-visit recently proposed methods to improve few-sample fine-tuning with BERT and re-evaluate their effectiveness. Generally, we observe the impact of these methods diminishes significantly with our modified process. 
", + "title": "Revisiting Few-sample BERT Fine-tuning", + "authors": [ + "Tianyi Zhang", + "Felix Wu", + "Arzoo Katiyar", + "Kilian Q Weinberger", + "Yoav Artzi" + ], + "emails": [ + "~Yoav_Artzi1", + "~Arzoo_Katiyar1", + "~Tianyi_Zhang2", + "~Felix_Wu1", + "~Kilian_Q_Weinberger1" + ], + "rank": 704 + }, + { + "url": "https://openreview.net/forum?id=ahAUv8TI2Mz", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 7, + 5 + ], + "rating": "6.18", + "confidences": [ + 5, + 5, + 3, + 4, + 5 + ], + "abstract": "We address the problem of generalized zero-shot learning (GZSL) where the task is to predict the class label of a target image whether its label belongs to the seen or unseen category. Similar to ZSL, the learning setting assumes that all class-level semantic features are given, while only the images of seen classes are available for training. By exploring the correlation between image features and the corresponding semantic features, the main idea of the proposed approach is to enrich the semantic-to-visual (S2V) embeddings via a seamless fusion of adaptive and generative learning. To this end, we extend the semantic features of each class by supplementing image-adaptive attention so that the learned S2V embedding can account for not only inter-class but also intra-class variations. In addition, to break the limit of training with images only from seen classes, we design a generative scheme to simultaneously generate virtual class labels and their visual features by sampling and interpolating over seen counterparts. In inference, a testing image will give rise to two different S2V embeddings, seen and virtual. The former is used to decide whether the underlying label is of the unseen category or otherwise a specific seen class; the latter is to predict an unseen class label. To demonstrate the effectiveness of our method, we report state-of-the-art results on four standard GZSL datasets, including an ablation study of the proposed modules. 
", + "title": "Adaptive and Generative Zero-Shot Learning", + "authors": [ + "Yu-Ying Chou", + "Hsuan-Tien Lin", + "Tyng-Luh Liu" + ], + "emails": [ + "~Tyng-Luh_Liu1", + "~Yu-Ying_Chou1", + "~Hsuan-Tien_Lin1" + ], + "rank": 705 + }, + { + "url": "https://openreview.net/forum?id=ZcKPWuhG6wy", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 8, + 6, + 5 + ], + "rating": "6.18", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "Though data augmentation has become a standard component of deep neural network training, the underlying mechanism behind the effectiveness of these techniques remains poorly understood. In practice, augmentation policies are often chosen using heuristics of distribution shift or augmentation diversity. Inspired by these, we conduct an empirical study to quantify how data augmentation improves model generalization. We introduce two interpretable and easy-to-compute measures: Affinity and Diversity. We find that augmentation performance is predicted not by either of these alone but by jointly optimizing the two.", + "title": "Tradeoffs in Data Augmentation: An Empirical Study", + "authors": [ + "Raphael Gontijo-Lopes", + "Sylvia Smullin", + "Ekin Dogus Cubuk", + "Ethan Dyer" + ], + "emails": [ + "stanfordalumni.org", + "~Raphael_Gontijo-Lopes1", + "~Ethan_Dyer1", + "~Ekin_Dogus_Cubuk1" + ], + "rank": 706 + }, + { + "url": "https://openreview.net/forum?id=MG8Zde0ip6u", + "decision": "Reject", + "ratings": [ + 9, + 4, + 5 + ], + "rating": "6.18", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "The raise in popularity of personalized web and mobile applications brings about a need of robust authentication systems. Although password authentication is the most popular authentication mechanism, it has also several drawbacks. Behavioral Biometrics Authentication has emerged as a complementary risk-based authentication approach which aims at profiling users based on their behavior while interacting with computers/smartphones. 
In this work we propose a novel Siamese Neural Network to perform a few-shot verification of user's behavior. We develop our approach to identify behavior from either human-computer or human-smartphone interaction. For computer interaction our approach learns from mouse and keyboard dynamics, while for smartphone interaction it learns from holding patterns and touch patterns. We show that our approach has a few-shot classification accuracy of up to 99.8% and 90.8% for mobile and web interactions, respectively. We also test our approach on a database that contains over 100K different web interactions collected in the wild.", + "title": "A Siamese Neural Network for Behavioral Biometrics Authentication", + "authors": [ + "Jes\u00fas Solano", + "Esteban Rivera", + "Alejandra Castelblanco", + "Lizzy Tengana", + "Christian Lopez", + "Martin Ochoa" + ], + "emails": [ + "~Jes\u00fas_Solano1", + "appgate.com" + ], + "rank": 707 + }, + { + "url": "https://openreview.net/forum?id=1OCTOShAmqB", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 7, + 6, + 8 + ], + "rating": "6.18", + "confidences": [ + 3, + 2, + 3, + 3 + ], + "abstract": "The attention mechanism has been widely used in deep neural networks as a model component. By now, it has become a critical building block in many state-of-the-art natural language models. Despite its great success established empirically, the working mechanism of attention has not been investigated at a sufficient theoretical depth to date. In this paper, we set up a simple text classification task and study the dynamics of training a simple attention-based classification model using gradient descent. In this setting, we show that, for the discriminative words that the model should attend to, a persisting identity exists relating its embedding and the inner product of its key and the query. 
This allows us to prove that training must converge to attending to the discriminative words when the attention output is classified by a linear classifier. Experiments are performed, which validate our theoretical analysis and provide further insights.", + "title": "On the Dynamics of Training Attention Models", + "authors": [ + "Haoye Lu", + "Yongyi Mao", + "Amiya Nayak" + ], + "emails": [ + "~Haoye_Lu1", + "uottawa.ca", + "~Yongyi_Mao2" + ], + "rank": 708 + }, + { + "url": "https://openreview.net/forum?id=-6vS_4Kfz0", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 7, + 6 + ], + "rating": "6.18", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "For deep neural network accelerators, memory movement is both energetically expensive and can bound computation. Therefore, optimal mapping of tensors to memory hierarchies is critical to performance. The growing complexity of neural networks calls for automated memory mapping instead of manual heuristic approaches; yet the search space of neural network computational graphs have previously been prohibitively large. We introduce Evolutionary Graph Reinforcement Learning (EGRL), a method designed for large search spaces, that combines graph neural networks, reinforcement learning, and evolutionary search. A set of fast, stateless policies guide the evolutionary search to improve its sample-efficiency. We train and validate our approach directly on the Intel NNP-I chip for inference. EGRL outperforms policy-gradient, evolutionary search and dynamic programming baselines on BERT, ResNet-101 and ResNet-50. We additionally achieve 28-78% speed-up compared to the native NNP-I compiler on all three workloads. 
", + "title": "Optimizing Memory Placement using Evolutionary Graph Reinforcement Learning", + "authors": [ + "Shauharda Khadka", + "Estelle Aflalo", + "Mattias Mardar", + "Avrech Ben-David", + "Santiago Miret", + "Shie Mannor", + "Tamir Hazan", + "Hanlin Tang", + "Somdeb Majumdar" + ], + "emails": [ + "~Shauharda_Khadka1", + "~Tamir_Hazan1", + "~Hanlin_Tang1", + "~Somdeb_Majumdar1", + "~Shie_Mannor2", + "intel.com", + "campus.technion.ac.il", + "~Santiago_Miret1" + ], + "rank": 709 + }, + { + "url": "https://openreview.net/forum?id=O3bqkf_Puys", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 7 + ], + "rating": "6.17", + "confidences": [ + 4, + 5, + 3 + ], + "abstract": "Point cloud sequences are irregular and unordered in the spatial dimension while exhibiting regularities and order in the temporal dimension. Therefore, existing grid based convolutions for conventional video processing cannot be directly applied to spatio-temporal modeling of raw point cloud sequences. In this paper, we propose a point spatio-temporal (PST) convolution to achieve informative representations of point cloud sequences. The proposed PST convolution first disentangles space and time in point cloud sequences. Then, a spatial convolution is employed to capture the local structure of points in the 3D space, and a temporal convolution is used to model the dynamics of the spatial regions along the time dimension. Furthermore, we incorporate the proposed PST convolution into a deep network, namely PSTNet, to extract features of point cloud sequences in a hierarchical manner. 
Extensive experiments on widely-used 3D action recognition and 4D semantic segmentation datasets demonstrate the effectiveness of PSTNet to model point cloud sequences.", + "title": "PSTNet: Point Spatio-Temporal Convolution on Point Cloud Sequences", + "authors": [ + "Hehe Fan", + "Xin Yu", + "Yuhang Ding", + "Yi Yang", + "Mohan Kankanhalli" + ], + "emails": [ + "~Xin_Yu1", + "~Yi_Yang4", + "~Hehe_Fan1", + "~Mohan_Kankanhalli1", + "~Yuhang_Ding1" + ], + "rank": 710 + }, + { + "url": "https://openreview.net/forum?id=pXmtZdDW16", + "decision": "Reject", + "ratings": [ + 7, + 5, + 6, + 7 + ], + "rating": "6.17", + "confidences": [ + 2, + 3, + 4, + 3 + ], + "abstract": "We develop a theory for embedding a random graph using graph neural networks (GNN) and illustrate its capability to solve NP-hard scheduling problems. We apply the theory to address the challenge of developing a near-optimal learning algorithm to solve the NP-hard problem of scheduling multiple robots/machines with time-varying rewards. In particular, we consider a class of reward collection problems called Multi-Robot Reward Collection (MRRC). Such MRRC problems well model ride-sharing, pickup-and-delivery, and a variety of related problems. We consider the classic identical parallel machine scheduling problem (IPMS) in the Appendix. \n\nFor the theory, we first observe that MRRC system state can be represented as an extension of probabilistic graphical models (PGMs), which we refer to as random PGMs. We then develop a mean-field inference method for random PGMs. \nWe prove that a simple modification of a typical GNN embedding is sufficient to embed a random graph even when the edge presence probabilities are interdependent.\n\nOur theory enables a two-step hierarchical inference for precise and transferable Q-function estimation for MRRC and IPMS. 
For scalable computation, we show that the transferability of Q-function estimation enables us to design a polynomial-time algorithm with 1-1/e optimality bound. \nExperimental results on solving NP-hard MRRC problems (and IMPS in the Appendix) highlight the near-optimality and transferability of the proposed methods. \n", + "title": "Embedding a random graph via GNN: mean-field inference theory and RL applications to NP-Hard multi-robot/machine scheduling", + "authors": [ + "HYUNWOOK KANG", + "SEUNGWOO SCHIN", + "James Morrison", + "Jinkyoo Park" + ], + "emails": [ + "kaist.edu", + "gmail.com", + "~HYUNWOOK_KANG1", + "~Jinkyoo_Park1" + ], + "rank": 711 + }, + { + "url": "https://openreview.net/forum?id=MA8eT-vUPvZ", + "decision": "Reject", + "ratings": [ + 6, + 7, + 5 + ], + "rating": "6.17", + "confidences": [ + 4, + 5, + 3 + ], + "abstract": "A fundamental assumption of most machine learning algorithms is that the training and test data are drawn from the same underlying distribution. However, this assumption is violated in almost all practical applications: machine learning systems are regularly tested under distribution shift, due to temporal correlations, particular end users, or other factors. In this work, we consider the setting where the training data are structured into groups and test time shifts correspond to changes in the group distribution. Prior work has approached this problem by attempting to be robust to all possible test time distributions, which may degrade average performance. In contrast, we propose to use ideas from meta-learning to learn models that are adaptable, such that they can adapt to shift at test time using a batch of unlabeled test points. We acquire such models by learning to adapt to training batches sampled according to different distributions, which simulate structural shifts that may occur at test time. 
Our primary contribution is to introduce the framework of adaptive risk minimization (ARM), a formalization of this setting that lends itself to meta-learning. We develop meta-learning methods for solving the ARM problem, and compared to a variety of prior methods, these methods provide substantial gains on image classification problems in the presence of shift.", + "title": "Adaptive Risk Minimization: A Meta-Learning Approach for Tackling Group Shift", + "authors": [ + "Marvin Mengxin Zhang", + "Henrik Marklund", + "Nikita Dhawan", + "Abhishek Gupta", + "Sergey Levine", + "Chelsea Finn" + ], + "emails": [ + "~Marvin_Mengxin_Zhang2", + "~Chelsea_Finn1", + "cs.stanford.edu", + "~Abhishek_Gupta1", + "berkeley.edu", + "~Sergey_Levine1" + ], + "rank": 712 + }, + { + "url": "https://openreview.net/forum?id=IX3Nnir2omJ", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 7 + ], + "rating": "6.17", + "confidences": [ + 5, + 4, + 3 + ], + "abstract": "Batch Normalization is a key component in almost all state-of-the-art image classifiers, but it also introduces practical challenges: it breaks the independence between training examples within a batch, can incur compute and memory overhead, and often results in unexpected bugs. Building on recent theoretical analyses of deep ResNets at initialization, we propose a simple set of analysis tools to characterize signal propagation on the forward pass, and leverage these tools to design highly performant ResNets without activation normalization layers. Crucial to our success is an adapted version of the recently proposed Weight Standardization. Our analysis tools show how this technique preserves the signal in ReLU networks by ensuring that the per-channel activation means do not grow with depth. 
Across a range of FLOP budgets, our networks attain performance competitive with state-of-the-art EfficientNets on ImageNet.", + "title": "Characterizing signal propagation to close the performance gap in unnormalized ResNets", + "authors": [ + "Andrew Brock", + "Soham De", + "Samuel L Smith" + ], + "emails": [ + "~Andrew_Brock1", + "~Samuel_L_Smith1", + "~Soham_De2" + ], + "rank": 713 + }, + { + "url": "https://openreview.net/forum?id=o3iritJHLfO", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 5, + 8 + ], + "rating": "6.16", + "confidences": [ + 5, + 5, + 5, + 4 + ], + "abstract": "Although early text-to-speech (TTS) models such as Tacotron 2 have succeeded in generating human-like speech, their autoregressive architectures have several limitations: (1) They require a lot of time to generate a mel-spectrogram consisting of hundreds of steps. (2) The autoregressive speech generation shows a lack of robustness due to its error propagation property. In this paper, we propose a novel non-autoregressive TTS model called BVAE-TTS, which eliminates the architectural limitations and generates a mel-spectrogram in parallel. BVAE-TTS adopts a bidirectional-inference variational autoencoder (BVAE) that learns hierarchical latent representations using both bottom-up and top-down paths to increase its expressiveness. To apply BVAE to TTS, we design our model to utilize text information via an attention mechanism. By using attention maps that BVAE-TTS generates, we train a duration predictor so that the model uses the predicted duration of each phoneme at inference. In experiments conducted on LJSpeech dataset, we show that our model generates a mel-spectrogram 27 times faster than Tacotron 2 with similar speech quality. 
Furthermore, our BVAE-TTS outperforms Glow-TTS, which is one of the state-of-the-art non-autoregressive TTS models, in terms of both speech quality and inference speed while having 58% fewer parameters.", + "title": "Bidirectional Variational Inference for Non-Autoregressive Text-to-Speech", + "authors": [ + "Yoonhyung Lee", + "Joongbo Shin", + "Kyomin Jung" + ], + "emails": [ + "~Yoonhyung_Lee2", + "~Joongbo_Shin1", + "~Kyomin_Jung1" + ], + "rank": 714 + }, + { + "url": "https://openreview.net/forum?id=xzqLpqRzxLq", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 6, + 8, + 5 + ], + "rating": "6.16", + "confidences": [ + 4, + 3, + 4, + 4, + 4 + ], + "abstract": "The need of collecting large quantities of labeled training data for each new task has limited the usefulness of deep neural networks. Given data from a set of source tasks, this limitation can be overcome using two transfer learning approaches: few-shot learning (FSL) and self-supervised learning (SSL). The former aims to learn `how to learn' by designing learning episodes using source tasks to simulate the challenge of solving the target new task with few labeled samples. In contrast, the latter exploits an annotation-free pretext task across all source tasks in order to learn generalizable feature representations. In this work, we propose a novel Instance-level and Episode-level Pretext Task (IEPT) framework that seamlessly integrates SSL into FSL. Specifically, given an FSL episode, we first apply geometric transformations to each instance to generate extended episodes. At the instance-level, transformation recognition is performed as per standard SSL. Importantly, at the episode-level, two SSL-FSL hybrid learning objectives are devised: (1) The consistency across the predictions of an FSL classifier from different extended episodes is maximized as an episode-level pretext task. 
(2) The features extracted from each instance across different episodes are integrated to construct a single FSL classifier for meta-learning. Extensive experiments show that our proposed model (i.e., FSL with IEPT) achieves the new state-of-the-art. ", + "title": "IEPT: Instance-Level and Episode-Level Pretext Tasks for Few-Shot Learning", + "authors": [ + "Manli Zhang", + "Jianhong Zhang", + "Zhiwu Lu", + "Tao Xiang", + "Mingyu Ding", + "Songfang Huang" + ], + "emails": [ + "~Songfang_Huang1", + "~Jianhong_Zhang1", + "~Zhiwu_Lu1", + "~Mingyu_Ding1", + "~Manli_Zhang1", + "~Tao_Xiang1" + ], + "rank": 715 + }, + { + "url": "https://openreview.net/forum?id=AjrRA6WYSW", + "decision": "Reject", + "ratings": [ + 5, + 7, + 6, + 6 + ], + "rating": "6.15", + "confidences": [ + 2, + 4, + 3, + 4 + ], + "abstract": "Most community detection algorithms assume the number of communities, K, to be known \\textit{a priori}. Among various approaches that have been proposed to estimate K, the non-parametric methods based on the spectral properties of the Bethe Hessian matrices have garnered much popularity for their simplicity, computational efficiency, and robust performance irrespective of the sparsity of the input data. Recently, one such method has been shown to estimate K consistently if the input network is generated from the (semi-dense) stochastic block model, when the average of the expected degrees (d~) of all the nodes in the network satisfies d~\u226blog\u2061(N) (N being the number of nodes in the network). In this paper, we prove some finite sample results that hold for d~=o(log\u2061(N)), which in turn show that the estimation of K based on the spectra of the Bethe Hessian matrices is consistent not only for the semi-dense regime, but also for the sub-logarithmic sparse regime when 1\u226ad~\u226alog\u2061(N). 
Thus, our estimation procedure is a robust method for a wide range of problem settings, regardless of the sparsity of the network input.", + "title": "Estimation of Number of Communities in Assortative Sparse Networks", + "authors": [ + "Neil Hwang", + "Jiarui Xu", + "Shirshendu Chatterjee", + "Sharmodeep Bhattacharyya" + ], + "emails": [ + "ccny.cuny.edu", + "oregonstate.edu", + "gmail.com", + "~Sharmodeep_Bhattacharyya1" + ], + "rank": 716 + }, + { + "url": "https://openreview.net/forum?id=PQlC91XxqK5", + "decision": "Reject", + "ratings": [ + 6, + 5, + 7 + ], + "rating": "6.15", + "confidences": [ + 5, + 3, + 5 + ], + "abstract": "In this work, we present Lexical Unit Analysis (LUA), a framework for general sequence segmentation tasks. Given a natural language sentence, LUA scores all the valid segmentation candidates and utilizes dynamic programming (DP) to extract the maximum scoring one. LUA enjoys a number of appealing properties such as inherently guaranteeing the predicted segmentation to be valid and facilitating globally optimal training and inference. Besides, the practical time complexity of LUA can be reduced to linear time, which is very efficient. We have conducted extensive experiments on 5 tasks, including syntactic chunking, named entity recognition (NER), slot filling, Chinese word segmentation, and Chinese part-of-speech (POS) tagging, across 15 datasets. Our models have achieved the state-of-the-art performances on 13 of them. 
The results also show that the F1 score of identifying long-length segments is notably improved.", + "title": "Segmenting Natural Language Sentences via Lexical Unit Analysis", + "authors": [ + "Yangming Li", + "lemao liu", + "Shuming Shi" + ], + "emails": [ + "~Yangming_Li1", + "~lemao_liu1", + "~Shuming_Shi1" + ], + "rank": 717 + }, + { + "url": "https://openreview.net/forum?id=udbMZR1cKE6", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6, + 7, + 6 + ], + "rating": "6.15", + "confidences": [ + 3, + 1, + 3, + 3, + 3 + ], + "abstract": "In this paper, we consider the problem of leveraging textual descriptions to improve generalization of control policies to new scenarios. Unlike prior work in this space, we do not assume access to any form of prior knowledge connecting text and state observations, and learn both symbol grounding and control policy simultaneously. This is challenging due to a lack of concrete supervision, and incorrect groundings can result in worse performance than policies that do not use the text at all. \nWe develop a new model, EMMA (Entity Mapper with Multi-modal Attention) which uses a multi-modal entity-conditioned attention module that allows for selective focus over relevant sentences in the manual for each entity in the environment. EMMA is end-to-end differentiable and can learn a latent grounding of entities and dynamics from text to observations using environment rewards as the only source of supervision.\nTo empirically test our model, we design a new framework of 1320 games and collect text manuals with free-form natural language via crowd-sourcing. We demonstrate that EMMA achieves successful zero-shot generalization to unseen games with new dynamics, obtaining significantly higher rewards compared to multiple baselines. The grounding acquired by EMMA is also robust to noisy descriptions and linguistic variation.", + "title": "Grounding Language to Entities for Generalization in Reinforcement Learning", + "authors": [ + "H. J. 
Austin Wang", + "Karthik R Narasimhan" + ], + "emails": [ + "~H._J._Austin_Wang1", + "~Karthik_R_Narasimhan1" + ], + "rank": 718 + }, + { + "url": "https://openreview.net/forum?id=snOgiCYZgJ7", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 6 + ], + "rating": "6.15", + "confidences": [ + 4, + 2, + 4, + 3 + ], + "abstract": "Our work is concerned with the generation and targeted design of RNA, a type of genetic macromolecule that can adopt complex structures which influence their cellular activities and functions. The design of large scale and complex biological structures spurs dedicated graph-based deep generative modeling techniques, which represents a key but underappreciated aspect of computational drug discovery. In this work, we investigate the principles behind representing and generating different RNA structural modalities, and propose a flexible framework to jointly embed and generate these molecular structures along with their sequence in a meaningful latent space. Equipped with a deep understanding of RNA molecular structures, our most sophisticated encoding and decoding methods operate on the molecular graph as well as the junction tree hierarchy, integrating strong inductive bias about RNA structural regularity and folding mechanism such that high structural validity, stability and diversity of generated RNAs are achieved. Also, we seek to adequately organize the latent space of RNA molecular embeddings with regard to the interaction with proteins, and targeted optimization is used to navigate in this latent space to search for desired novel RNA molecules.", + "title": "Neural representation and generation for RNA secondary structures", + "authors": [ + "Zichao Yan", + "William L. 
Hamilton", + "Mathieu Blanchette" + ], + "emails": [ + "~Mathieu_Blanchette1", + "~William_L._Hamilton1", + "~Zichao_Yan1" + ], + "rank": 719 + }, + { + "url": "https://openreview.net/forum?id=TgSVWXw22FQ", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 6 + ], + "rating": "6.15", + "confidences": [ + 2, + 1, + 5, + 5 + ], + "abstract": "Voice style transfer, also called voice conversion, seeks to modify one speaker's voice to generate speech as if it came from another (target) speaker. Previous works have made progress on voice conversion with parallel training data and pre-known speakers. However, zero-shot voice style transfer, which learns from non-parallel data and generates voices for previously unseen speakers, remains a challenging problem. In this paper we propose a novel zero-shot voice transfer method via disentangled representation learning. The proposed method first encodes speaker-related style and voice content of each input voice into separate low-dimensional embedding spaces, and then transfers to a new voice by combining the source content embedding and target style embedding through a decoder. With information-theoretic guidance, the style and content embedding spaces are representative and (ideally) independent of each other. 
On real-world datasets, our method outperforms other baselines and obtains state-of-the-art results in terms of transfer accuracy and voice naturalness.", + "title": "Improving Zero-Shot Voice Style Transfer via Disentangled Representation Learning", + "authors": [ + "Siyang Yuan", + "Pengyu Cheng", + "Ruiyi Zhang", + "Weituo Hao", + "Zhe Gan", + "Lawrence Carin" + ], + "emails": [ + "~Pengyu_Cheng1", + "~Siyang_Yuan1", + "~Zhe_Gan1", + "~Ruiyi_Zhang3", + "~Lawrence_Carin2", + "~Weituo_Hao1" + ], + "rank": 720 + }, + { + "url": "https://openreview.net/forum?id=xsx58rmaW2p", + "decision": "Reject", + "ratings": [ + 6, + 8, + 5, + 5 + ], + "rating": "6.15", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "We propose a new metric (m-coherence) to experimentally study the alignment of per-example gradients during training. Intuitively, given a sample of size m, m-coherence is the number of examples in the sample that benefit from a small step along the gradient of any one example on average. We show that compared to other commonly used metrics, m-coherence is more interpretable, cheaper to compute (O(m) instead of O(m2)) and mathematically cleaner. (We note that m-coherence is closely connected to gradient diversity, a quantity previously used in some theoretical bounds.) Using m-coherence, we study the evolution of alignment of per-example gradients in ResNet and EfficientNet models on ImageNet and several variants with label noise, particularly from the perspective of the recently proposed Coherent Gradients (CG) theory that provides a simple, unified explanation for memorization and generalization [Chatterjee, ICLR 20]. Although we have several interesting takeaways, our most surprising result concerns memorization. Naively, one might expect that when training with completely random labels, each example is fitted independently, and so m-coherence should be close to 1. 
However, this is not the case: m-coherence reaches moderately high values during training (though still much smaller than real labels), indicating that over-parameterized neural networks find common patterns even in scenarios where generalization is not possible. A detailed analysis of this phenomenon provides both a deeper confirmation of CG, but at the same point puts into sharp relief what is missing from the theory in order to provide a complete explanation of generalization in neural networks.", + "title": "Making Coherence Out of Nothing At All: Measuring Evolution of Gradient Alignment", + "authors": [ + "Satrajit Chatterjee", + "Piotr Zielinski" + ], + "emails": [ + "google.com", + "~Satrajit_Chatterjee1" + ], + "rank": 721 + }, + { + "url": "https://openreview.net/forum?id=ATp1nW2FuZL", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 6, + 5, + 5 + ], + "rating": "6.15", + "confidences": [ + 4, + 3, + 3, + 3 + ], + "abstract": "Recent research has proposed neural architectures for solving combinatorial problems in structured output spaces. In many such problems, there may exist multiple solutions for a given input, e.g. a partially filled Sudoku puzzle may have many completions satisfying all constraints. Further, we are often interested in finding any \"one\" of the possible solutions, without any preference between them. Existing approaches completely ignore this solution multiplicity. In this paper, we argue that being oblivious to the presence of multiple solutions can severely hamper their training ability. Our contribution is two-fold. First, we formally define the task of learning one-of-many solutions for combinatorial problems in structured output spaces, which is applicable for solving several problems of interest such as N-Queens, and Sudoku. Second, we present a generic learning framework that adapts an existing prediction network for a combinatorial problem to handle solution multiplicity. 
Our framework uses a selection module, whose goal is to dynamically determine, for every input, the solution that is most effective for training the network parameters in any given learning iteration. We propose an RL based approach to jointly train the selection module with the prediction network. Experiments on three different domains, and using two different prediction networks, demonstrate that our framework significantly improves the accuracy in our setting, obtaining up to 21 pt gain over the baselines.\n", + "title": "Neural Learning of One-of-Many Solutions for Combinatorial Problems in Structured Output Spaces", + "authors": [ + "Yatin Nandwani", + "Deepanshu Jindal", + "Mausam .", + "Parag Singla" + ], + "emails": [ + "~Yatin_Nandwani1", + "gmail.com", + "~Mausam_.1", + "~Parag_Singla1" + ], + "rank": 722 + }, + { + "url": "https://openreview.net/forum?id=MIDckA56aD", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 6, + 6, + 5 + ], + "rating": "6.15", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Although much progress has been made towards robust deep learning, a significant gap in robustness remains between real-world perturbations and more narrowly defined sets typically studied in adversarial defenses. In this paper, we aim to bridge this gap by learning perturbation sets from data, in order to characterize real-world effects for robust training and evaluation. Specifically, we use a conditional generator that defines the perturbation set over a constrained region of the latent space. We formulate desirable properties that measure the quality of a learned perturbation set, and theoretically prove that a conditional variational autoencoder naturally satisfies these criteria. Using this framework, our approach can generate a variety of perturbations at different complexities and scales, ranging from baseline spatial transformations, through common image corruptions, to lighting variations. 
We measure the quality of our learned perturbation sets both quantitatively and qualitatively, finding that our models are capable of producing a diverse set of meaningful perturbations beyond the limited data seen during training. Finally, we leverage our learned perturbation sets to train models which are empirically and certifiably robust to adversarial image corruptions and adversarial lighting variations, while improving generalization on non-adversarial data. All code and configuration files for reproducing the experiments as well as pretrained model weights can be found at https://github.com/locuslab/perturbation_learning. ", + "title": "Learning perturbation sets for robust machine learning", + "authors": [ + "Eric Wong", + "J Zico Kolter" + ], + "emails": [ + "~J_Zico_Kolter1", + "~Eric_Wong1" + ], + "rank": 723 + }, + { + "url": "https://openreview.net/forum?id=90JprVrJBO", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 5 + ], + "rating": "6.15", + "confidences": [ + 3, + 4, + 2, + 4 + ], + "abstract": "Methods for automatically learning to solve routing problems are rapidly improving in performance. While most of these methods excel at generating solutions quickly, they are unable to effectively utilize longer run times because they lack a sophisticated search component. We present a learning-based optimization approach that allows a guided search in the distribution of high-quality solutions for a problem instance. More precisely, our method uses a conditional variational autoencoder that learns to map points in a continuous (latent) search space to high-quality, instance-specific routing problem solutions. The learned space can then be searched by any unconstrained continuous optimization method. We show that even using a standard differential evolution search strategy our approach is able to outperform existing purely machine learning based approaches. 
", + "title": "Learning a Latent Search Space for Routing Problems using Variational Autoencoders", + "authors": [ + "Andr\u00e9 Hottung", + "Bhanu Bhandari", + "Kevin Tierney" + ], + "emails": [ + "~Kevin_Tierney1", + "~Andr\u00e9_Hottung1", + "cs.umass.edu" + ], + "rank": 724 + }, + { + "url": "https://openreview.net/forum?id=TR-Nj6nFx42", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 7, + 6 + ], + "rating": "6.15", + "confidences": [ + 4, + 2, + 4, + 3 + ], + "abstract": "In this paper, we derive generalization bounds for two primary classes of graph neural networks (GNNs), namely graph convolutional networks (GCNs) and message passing GNNs (MPGNNs), via a PAC-Bayesian approach. Our result reveals that the maximum node degree and the spectral norm of the weights govern the generalization bounds of both models. We also show that our bound for GCNs is a natural generalization of the results developed in \\citep{neyshabur2017pac} for fully-connected and convolutional neural networks. For MPGNNs, our PAC-Bayes bound improves over the Rademacher complexity based bound \\citep{garg2020generalization}, showing a tighter dependency on the maximum node degree and the maximum hidden dimension. The key ingredients of our proofs are a perturbation analysis of GNNs and the generalization of PAC-Bayes analysis to non-homogeneous GNNs. We perform an empirical study on several synthetic and real-world graph datasets and verify that our PAC-Bayes bound is tighter than others. 
", + "title": "A PAC-Bayesian Approach to Generalization Bounds for Graph Neural Networks", + "authors": [ + "Renjie Liao", + "Raquel Urtasun", + "Richard Zemel" + ], + "emails": [ + "~Renjie_Liao1", + "~Raquel_Urtasun1", + "~Richard_Zemel1" + ], + "rank": 725 + }, + { + "url": "https://openreview.net/forum?id=23ZjUGpjcc", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 5 + ], + "rating": "6.15", + "confidences": [ + 3, + 2, + 4, + 4 + ], + "abstract": "Transfer of pre-trained representations can improve sample efficiency and reduce computational requirements for new tasks. However, representations used for transfer are usually generic, and are not tailored to a particular distribution of downstream tasks. We explore the use of expert representations for transfer with a simple, yet effective, strategy. We train a diverse set of experts by exploiting existing label structures, and use cheap-to-compute performance proxies to select the relevant expert for each target task. This strategy scales the process of transferring to new tasks, since it does not revisit the pre-training data during transfer. Accordingly, it requires little extra compute per target task, and results in a speed-up of 2-3 orders of magnitude compared to competing approaches. Further, we provide an adapter-based architecture able to compress many experts into a single model. 
We evaluate our approach on two different data sources and demonstrate that it outperforms baselines on over 20 diverse vision tasks in both cases.", + "title": "Scalable Transfer Learning with Expert Models", + "authors": [ + "Joan Puigcerver", + "Carlos Riquelme Ruiz", + "Basil Mustafa", + "Cedric Renggli", + "Andr\u00e9 Susano Pinto", + "Sylvain Gelly", + "Daniel Keysers", + "Neil Houlsby" + ], + "emails": [ + "~Cedric_Renggli1", + "~Joan_Puigcerver1", + "~Andr\u00e9_Susano_Pinto1", + "~Neil_Houlsby1", + "~Carlos_Riquelme_Ruiz1", + "google.com", + "~Sylvain_Gelly1", + "~Daniel_Keysers2" + ], + "rank": 726 + }, + { + "url": "https://openreview.net/forum?id=9QLRCVysdlO", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 8, + 7, + 7 + ], + "rating": "6.14", + "confidences": [ + 5, + 3, + 3, + 3 + ], + "abstract": "To alleviate the resource constraint for real-time point cloud applications that run on edge devices, in this paper we present BiPointNet, the first model binarization approach for efficient deep learning on point clouds. We discover that the immense performance drop of binarized models for point clouds mainly stems from two challenges: aggregation-induced feature homogenization that leads to a degradation of information entropy, and scale distortion that hinders optimization and invalidates scale-sensitive structures. With theoretical justifications and in-depth analysis, our BiPointNet introduces Entropy-Maximizing Aggregation (EMA) to modulate the distribution before aggregation for the maximum information entropy, and Layer-wise Scale Recovery (LSR) to efficiently restore feature representation capacity. Extensive experiments show that BiPointNet outperforms existing binarization methods by convincing margins, at the level even comparable with the full precision counterpart. We highlight that our techniques are generic, guaranteeing significant improvements on various fundamental tasks and mainstream backbones. 
Moreover, BiPointNet gives an impressive 14.7\u00d7 speedup and 18.9\u00d7 storage saving on real-world resource-constrained devices.", + "title": "BiPointNet: Binary Neural Network for Point Clouds", + "authors": [ + "Haotong Qin", + "Zhongang Cai", + "Mingyuan Zhang", + "Yifu Ding", + "Haiyu Zhao", + "Shuai Yi", + "Xianglong Liu", + "Hao Su" + ], + "emails": [ + "~Xianglong_Liu2", + "~Shuai_Yi3", + "~Yifu_Ding2", + "~Mingyuan_Zhang1", + "buaa.edu.cn", + "~Hao_Su1", + "~Haiyu_Zhao1", + "~Zhongang_Cai1" + ], + "rank": 727 + }, + { + "url": "https://openreview.net/forum?id=M_KwRsbhi5e", + "decision": "Reject", + "ratings": [ + 8, + 7, + 7, + 4 + ], + "rating": "6.14", + "confidences": [ + 3, + 3, + 3, + 5 + ], + "abstract": "Branch-and-Bound~(B\\&B) is a general and widely used algorithm paradigm for solving Mixed Integer Programming~(MIP). \nRecently there is a surge of interest in designing learning-based branching policies as a fast approximation of strong branching, a human-designed heuristic. In this work, we argue strong branching is not a good expert to imitate for its poor decision quality when turning off its side effects in solving linear programming. To obtain more effective and non-myopic policies than a local heuristic, we formulate the branching process in MIP as reinforcement learning~(RL) and design a policy characterization for the B\\&B process to improve our agent by novelty search evolutionary strategy. Across a range of NP-hard problems, our trained RL agent significantly outperforms expert-designed branching rules and the state-of-the-art learning-based branching methods in terms of both speed and effectiveness. 
Our results suggest that with carefully designed policy networks and learning algorithms, reinforcement learning has the potential to advance algorithms for solving MIPs.", + "title": "Improving Learning to Branch via Reinforcement Learning", + "authors": [ + "Haoran Sun", + "Wenbo Chen", + "Hui Li", + "Le Song" + ], + "emails": [ + "~Le_Song1", + "~Wenbo_Chen2", + "~Haoran_Sun2", + "~Hui_Li2" + ], + "rank": 728 + }, + { + "url": "https://openreview.net/forum?id=_mQp5cr_iNy", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 5 + ], + "rating": "6.14", + "confidences": [ + 2, + 2, + 3 + ], + "abstract": "Despite definite success in deep reinforcement learning problems, actor-critic algorithms are still confronted with sample inefficiency in complex environments, particularly in tasks where efficient exploration is a bottleneck. These methods consider a policy (the actor) and a value function (the critic) whose respective losses are built using different motivations and approaches. This paper introduces a third protagonist: the adversary. While the adversary mimics the actor by minimizing the KL-divergence between their respective action distributions, the actor, in addition to learning to solve the task, tries to differentiate itself from the adversary predictions. This novel objective stimulates the actor to follow strategies that could not have been correctly predicted from previous trajectories, making its behavior innovative in tasks where the reward is extremely rare. Our experimental analysis shows that the resulting Adversarially Guided Actor-Critic (AGAC) algorithm leads to more exhaustive exploration. 
Notably, AGAC outperforms current state-of-the-art methods on a set of various hard-exploration and procedurally-generated tasks.", + "title": "Adversarially Guided Actor-Critic", + "authors": [ + "Yannis Flet-Berliac", + "Johan Ferret", + "Olivier Pietquin", + "Philippe Preux", + "Matthieu Geist" + ], + "emails": [ + "~Matthieu_Geist1", + "~Yannis_Flet-Berliac1", + "~Johan_Ferret1", + "~Philippe_Preux1", + "~Olivier_Pietquin1" + ], + "rank": 729 + }, + { + "url": "https://openreview.net/forum?id=5wmNjjvGOXh", + "decision": "Reject", + "ratings": [ + 7, + 6, + 7, + 4 + ], + "rating": "6.14", + "confidences": [ + 3, + 3, + 5, + 3 + ], + "abstract": "Sparse neural networks have been widely applied to reduce the necessary resource requirements to train and deploy over-parameterized deep neural networks. For inference acceleration, methods that induce sparsity from a pre-trained dense network (dense-to-sparse) work effectively. Recently, dynamic sparse training (DST) has been proposed to train sparse neural networks without pre-training a large and dense network (sparse-to-sparse), so that the training process can also be accelerated. However, previous sparse-to-sparse methods mainly focus on Multilayer Perceptron Networks (MLPs) and Convolutional Neural Networks (CNNs), failing to match the performance of dense-to-sparse methods in Recurrent Neural Networks (RNNs) setting. In this paper, we propose an approach to train sparse RNNs with a fixed parameter count in one single run, without compromising performance. During training, we allow RNN layers to have a non-uniform redistribution across cell weights for a better regularization. Further, we introduce SNT-ASGD, a variant of the averaged stochastic gradient optimizer, which significantly improves the performance of all sparse training methods for RNNs. 
Using these strategies, we achieve state-of-the-art sparse training results, even better than dense model results, with various types of RNNs on Penn TreeBank and Wikitext-2 datasets.", + "title": "Selfish Sparse RNN Training", + "authors": [ + "SHiwei Liu", + "Decebal Constantin Mocanu", + "Yulong Pei", + "Mykola Pechenizkiy" + ], + "emails": [ + "~Decebal_Constantin_Mocanu1", + "~SHiwei_Liu1", + "~Mykola_Pechenizkiy1", + "~Yulong_Pei1" + ], + "rank": 730 + }, + { + "url": "https://openreview.net/forum?id=mgVbI13p96", + "decision": "Reject", + "ratings": [ + 7, + 4, + 7, + 6 + ], + "rating": "6.13", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "In the image domain, excellent representation can be learned by inducing invariance to content-preserving transformations, such as image distortions. In this paper, we show that, for videos, the answer is more complex, and that better results can be obtained by accounting for the interplay between invariance, distinctiveness, multiple modalities and time. We introduce Generalized Data Transformations (GDTs) as a way to capture this interplay. GDTs reduce most previous self-supervised approaches to a choice of data transformations, even when this was not the case in the original formulations. They also allow to choose whether the representation should be invariant or distinctive w.r.t. each effect and tell which combinations are valid, thus allowing us to explore the space of combinations systematically. We show in this manner that being invariant to certain transformations and distinctive to others is critical to learning effective video representations, improving the state-of-the-art by a large margin, and even surpassing supervised pretraining. We demonstrate results on a variety of downstream video and audio classification and retrieval tasks, on datasets such as HMDB-51, UCF-101, DCASE2014, ESC-50 and VGG-Sound. 
In particular, we achieve new state-of-the-art accuracies of 72.8% on HMDB-51 and 95.2% on UCF-101.", + "title": "Multi-modal Self-Supervision from Generalized Data Transformations", + "authors": [ + "Mandela Patrick", + "Yuki Asano", + "Polina Kuznetsova", + "Ruth Fong", + "Joao F. Henriques", + "Geoffrey Zweig", + "Andrea Vedaldi" + ], + "emails": [ + "~Joao_F._Henriques1", + "~Andrea_Vedaldi1", + "~Ruth_Fong1", + "~Yuki_Asano1", + "fb.com", + "~Mandela_Patrick1" + ], + "rank": 731 + }, + { + "url": "https://openreview.net/forum?id=TaYhv-q1Xit", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 4, + 5, + 8 + ], + "rating": "6.13", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "In this paper, we apply harmonic distortion analysis to understand the effect of nonlinearities in the spectral domain. Each nonlinear layer creates higher-frequency harmonics, which we call \"blueshift\", whose magnitude increases with network depth, thereby increasing the \u201croughness\u201d of the output landscape. Unlike differential models (such as vanishing gradients, sharpness), this provides a more global view of how network architectures behave across larger areas of their parameter domain. For example, the model predicts that residual connections are able to counter the effect by dampening corresponding higher frequency modes. We empirically verify the connection between blueshift and architectural choices, and provide evidence for a connection with trainability.", + "title": "Ringing ReLUs: Harmonic Distortion Analysis of Nonlinear Feedforward Networks", + "authors": [ + "Christian H.X. 
Ali Mehmeti-G\u00f6pel", + "David Hartmann", + "Michael Wand" + ], + "emails": [ + "~David_Hartmann1", + "uni-mainz.de", + "~Christian_H.X._Ali_Mehmeti-G\u00f6pel1" + ], + "rank": 732 + }, + { + "url": "https://openreview.net/forum?id=6k7VdojAIK", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 8, + 7, + 3 + ], + "rating": "6.13", + "confidences": [ + 4, + 2, + 3, + 3, + 3 + ], + "abstract": "It is common practice to use large computational resources to train neural networks, known from many examples, such as reinforcement learning applications. However, while massively parallel computing is often used for training models, it is rarely used to search solutions for combinatorial optimization problems. This paper proposes a novel massively parallel Monte-Carlo Tree Search (MP-MCTS) algorithm that works efficiently for a 1,000 worker scale on a distributed memory environment using multiple compute nodes and applies it to molecular design. This paper is the first work that applies distributed MCTS to a real-world and non-game problem. Existing works on large-scale parallel MCTS show efficient scalability in terms of the number of rollouts up to 100 workers. Still, they suffer from the degradation in the quality of the solutions. MP-MCTS maintains the search quality at a larger scale. By running MP-MCTS on 256 CPU cores for only 10 minutes, we obtained candidate molecules with similar scores to non-parallel MCTS running for 42 hours. Moreover, our results based on parallel MCTS (combined with a simple RNN model) significantly outperform existing state-of-the-art work. 
Our method is generic and is expected to speed up other applications of MCTS.", + "title": "Practical Massively Parallel Monte-Carlo Tree Search Applied to Molecular Design", + "authors": [ + "Xiufeng Yang", + "Tanuj Aasawat", + "Kazuki Yoshizoe" + ], + "emails": [ + "~Xiufeng_Yang1", + "~Kazuki_Yoshizoe2", + "~Tanuj_Aasawat1" + ], + "rank": 733 + }, + { + "url": "https://openreview.net/forum?id=2V1ATRzaZQU", + "decision": "Reject", + "ratings": [ + 4, + 4, + 9, + 7 + ], + "rating": "6.13", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "As multi-agent systems proliferate in machine learning research, games have attracted much attention as a framework to understand optimization of multiple interacting objectives. However, a key challenge in game optimization is that, in general, there is no guarantee for usual gradient-based methods to converge to a local solution of the game. The latest work by Chavdarova et al. (2020) report that Lookahead optimizer (Zhang et al. (2019)) significantly improves the performance of Generative Adversarial Networks (GANs) and reduces the rotational force of bilinear games. While promising, their observations were purely empirical, and Lookahead optimization of smooth games still lacks theoretical understanding. In this paper, we fill this gap by theoretically characterizing Lookahead dynamics of smooth games. We provide an intuitive geometric explanation on how and when Lookahead can improve game dynamics in terms of stability and convergence. Furthermore, we present sufficient conditions under which Lookahead optimization of bilinear games provably stabilizes or accelerates convergence to a Nash equilibrium of the game. Finally, we show that Lookahead optimizer preserves locally asymptotically stable equilibria of base dynamics and can either stabilize or accelerate the local convergence to a given equilibrium with proper assumptions. 
We verify each of our theoretical predictions by conducting numerical experiments on two-player zero-sum (non-linear) games.", + "title": "Characterizing Lookahead Dynamics of Smooth Games", + "authors": [ + "Junsoo Ha", + "Gunhee Kim" + ], + "emails": [ + "~Junsoo_Ha1", + "~Gunhee_Kim1" + ], + "rank": 734 + }, + { + "url": "https://openreview.net/forum?id=c7rtqjVaWiE", + "decision": "Reject", + "ratings": [ + 8, + 5, + 5, + 7 + ], + "rating": "6.13", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Recently, sampling methods have been successfully applied to enhance the sample quality of Generative Adversarial Networks (GANs). However, in practice, they typically have poor sample efficiency because of the independent proposal sampling from the generator. In this work, we propose REP-GAN, a novel sampling method that allows general dependent proposals by REParameterizing the Markov chains into the latent space of the generator. Theoretically, we show that our reparameterized proposal admits a closed-form Metropolis-Hastings acceptance ratio. Empirically, extensive experiments on synthetic and real datasets demonstrate that our REP-GAN largely improves the sample efficiency and obtains better sample quality simultaneously.", + "title": "Efficient Sampling for Generative Adversarial Networks with Reparameterized Markov Chains", + "authors": [ + "Yifei Wang", + "Yisen Wang", + "Jiansheng Yang", + "Zhouchen Lin" + ], + "emails": [ + "math.pku.edu.cn", + "~Yisen_Wang1", + "~Zhouchen_Lin1", + "~Yifei_Wang1" + ], + "rank": 735 + }, + { + "url": "https://openreview.net/forum?id=QkRbdiiEjM", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 5 + ], + "rating": "6.13", + "confidences": [ + 3, + 3, + 5, + 4 + ], + "abstract": "The design of deep graph models still remains to be investigated and the crucial part is how to explore and exploit the knowledge from different hops of neighbors in an efficient way. 
In this paper, we propose a novel RNN-like deep graph neural network architecture by incorporating AdaBoost into the computation of network; and the proposed graph convolutional network called AdaGCN~(Adaboosting Graph Convolutional Network) has the ability to efficiently extract knowledge from high-order neighbors of current nodes and then integrates knowledge from different hops of neighbors into the network in an Adaboost way. Different from other graph neural networks that directly stack many graph convolution layers, AdaGCN shares the same base neural network architecture among all ``layers'' and is recursively optimized, which is similar to an RNN. Besides, We also theoretically established the connection between AdaGCN and existing graph convolutional methods, presenting the benefits of our proposal. Finally, extensive experiments demonstrate the consistent state-of-the-art prediction performance on graphs across different label rates and the computational advantage of our approach AdaGCN~\\footnote{Code is available at \\url{https://github.com/datake/AdaGCN}.}.", + "title": "AdaGCN: Adaboosting Graph Convolutional Networks into Deep Models", + "authors": [ + "Ke Sun", + "Zhanxing Zhu", + "Zhouchen Lin" + ], + "emails": [ + "~Zhouchen_Lin1", + "~Zhanxing_Zhu1", + "~Ke_Sun3" + ], + "rank": 736 + }, + { + "url": "https://openreview.net/forum?id=ohdw3t-8VCY", + "decision": "Reject", + "ratings": [ + 7, + 5, + 7, + 6 + ], + "rating": "6.12", + "confidences": [ + 3, + 5, + 4, + 4 + ], + "abstract": "Current summarization systems yield generic summaries that are disconnected from users' preferences and expectations. To address this limitation, we present CTRLsum, a novel framework for controllable summarization. Our approach enables users to control multiple aspects of generated summaries by interacting with the summarization system through textual input in the form of a set of keywords or descriptive prompts. 
Using a single unified model, CTRLsum is able to achieve a broad scope of summary manipulation at inference time without requiring additional human annotations or pre-defining a set of control aspects during training. We quantitatively demonstrate the effectiveness of our approach on three domains of summarization datasets and five control aspects: 1) entity-centric and 2) length-controllable summarization, 3) contribution summarization on scientific papers, 4) invention purpose summarization on patent filings, and 5) question-guided summarization on news articles in a reading comprehension setting. Moreover, when used in a standard, uncontrolled summarization setting, CTRLsum achieves state-of-the-art results on the CNN/DailyMail dataset.", + "title": "CTRLsum: Towards Generic Controllable Text Summarization", + "authors": [ + "Junxian He", + "Wojciech Maciej Kryscinski", + "Bryan McCann", + "Nazneen Rajani", + "Caiming Xiong" + ], + "emails": [ + "~Nazneen_Rajani1", + "~Bryan_McCann1", + "~Wojciech_Maciej_Kryscinski1", + "~Junxian_He1", + "~Caiming_Xiong1" + ], + "rank": 737 + }, + { + "url": "https://openreview.net/forum?id=01olnfLIbD", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 6, + 7 + ], + "rating": "6.12", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "Data Poisoning attacks modify training data to maliciously control a model trained on such data.\nIn this work, we focus on targeted poisoning attacks which cause a reclassification of an unmodified test image and as such breach model integrity. We consider a\nparticularly malicious poisoning attack that is both ``from scratch\" and ``clean label\", meaning we analyze an attack that successfully works against new, randomly initialized models, and is nearly imperceptible to humans, all while perturbing only a small fraction of the training data. 
\nPrevious poisoning attacks against deep neural networks in this setting have been limited in scope and success, working only in simplified settings or being prohibitively expensive for large datasets.\nThe central mechanism of the new attack is matching the gradient direction of malicious examples. We analyze why this works, supplement with practical considerations. and show its threat to real-world practitioners, finding that it is the first poisoning method to cause targeted misclassification in modern deep networks trained from scratch on a full-sized, poisoned ImageNet dataset.\nFinally we demonstrate the limitations of existing defensive strategies against such an attack, concluding that data poisoning is a credible threat, even for large-scale deep learning systems.", + "title": "Witches' Brew: Industrial Scale Data Poisoning via Gradient Matching", + "authors": [ + "Jonas Geiping", + "Liam H Fowl", + "W. Ronny Huang", + "Wojciech Czaja", + "Gavin Taylor", + "Michael Moeller", + "Tom Goldstein" + ], + "emails": [ + "~Wojciech_Czaja1", + "~Liam_H_Fowl1", + "~W._Ronny_Huang1", + "~Jonas_Geiping1", + "~Michael_Moeller1", + "~Tom_Goldstein1", + "~Gavin_Taylor1" + ], + "rank": 738 + }, + { + "url": "https://openreview.net/forum?id=SlprFTIQP3", + "decision": "Reject", + "ratings": [ + 7, + 6, + 6, + 5 + ], + "rating": "6.12", + "confidences": [ + 5, + 4, + 5, + 3 + ], + "abstract": "Class imbalance is a fundamental problem in computer vision applications such as semantic segmentation and image classification. Specifically, uneven class distributions in a training dataset often result in unsatisfactory performance on under-represented classes. Many works have proposed to weigh the standard cross entropy loss function with pre-computed weights based on class statistics such as the number of samples and class margins. 
There are two major drawbacks to these methods: 1) constantly up-weighing minority classes can introduce excessive false positives especially in semantic segmentation; 2) many recent works discovered that pre-computed weights have adversarial effects on representation learning. In this regard, we propose a hard-class mining loss by reshaping the vanilla cross entropy loss such that it weights the loss for each class dynamically based on changing recall performance. We show mathematically that the novel recall loss changes gradually between the standard cross entropy loss and the well-known inverse frequency cross entropy loss and balances precision and accuracy. We first demonstrate that the proposed loss effectively balances precision and accuracy on semantic segmentation datasets, and leads to significant performance improvement over other state-of-the-art loss functions used in semantic segmentation, especially on shallow networks. On image classification, we design a simple two-head training strategy to show that the novel loss function improves representation learning on imbalanced datasets. We outperform the previously best performing method by 5.7% on Place365-LT and by 1.1% on iNaturalist.", + "title": "Recall Loss for Imbalanced Image Classification and Semantic Segmentation", + "authors": [ + "Junjiao Tian", + "Niluthpol Chowdhury Mithun", + "Zachary Seymour", + "Han-pang Chiu", + "Zsolt Kira" + ], + "emails": [ + "~Junjiao_Tian1", + "~Niluthpol_Chowdhury_Mithun1", + "~Han-pang_Chiu1", + "~Zachary_Seymour1", + "~Zsolt_Kira1" + ], + "rank": 739 + }, + { + "url": "https://openreview.net/forum?id=XwATtbX3oCz", + "decision": "Reject", + "ratings": [ + 4, + 7, + 7, + 7 + ], + "rating": "6.12", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Processing point cloud data is an important component of many real-world systems. As such, a wide variety of point-based approaches have been proposed, reporting steady benchmark improvements over time. 
We study the key ingredients of this progress and uncover two critical results. First, we find that auxiliary factors like different evaluation schemes, data augmentation strategies, and loss functions, which are independent of the model architecture, make a large difference in performance. The differences are large enough that they obscure the effect of architecture. When these factors are controlled for, PointNet++, a relatively older network, performs competitively with recent methods. Second, a very simple projection-based method, which we refer to as SimpleView, performs surprisingly well. It achieves on par or better results than sophisticated state-of-the-art methods on ModelNet40, while being half the size of PointNet++. It also outperforms state-of-the-art methods on ScanObjectNN, a real-world point cloud benchmark, and demonstrates better cross-dataset generalization.\n", + "title": "Revisiting Point Cloud Classification with a Simple and Effective Baseline", + "authors": [ + "Ankit Goyal", + "Hei Law", + "Bowei Liu", + "Alejandro Newell", + "Jia Deng" + ], + "emails": [ + "~Alejandro_Newell2", + "~Bowei_Liu2", + "~Hei_Law2", + "~Jia_Deng1", + "~Ankit_Goyal1" + ], + "rank": 740 + }, + { + "url": "https://openreview.net/forum?id=gV3wdEOGy_V", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 8, + 6 + ], + "rating": "6.12", + "confidences": [ + 4, + 5, + 3, + 5 + ], + "abstract": "We present Mixture of Contrastive Experts (MiCE), a unified probabilistic clustering framework that simultaneously exploits the discriminative representations learned by contrastive learning and the semantic structures captured by a latent mixture model. Motivated by the mixture of experts, MiCE employs a gating function to partition an unlabeled dataset into subsets according to the latent semantics and multiple experts to discriminate distinct subsets of instances assigned to them in a contrastive learning manner. 
To solve the nontrivial inference and learning problems caused by the latent variables, we further develop a scalable variant of the Expectation-Maximization (EM) algorithm for MiCE and provide proof of the convergence. Empirically, we evaluate the clustering performance of MiCE on four widely adopted natural image datasets. MiCE achieves significantly better results than various previous methods and a strong contrastive learning baseline.", + "title": "MiCE: Mixture of Contrastive Experts for Unsupervised Image Clustering", + "authors": [ + "Tsung Wei Tsai", + "Chongxuan Li", + "Jun Zhu" + ], + "emails": [ + "~Jun_Zhu2", + "~Chongxuan_Li1", + "~Tsung_Wei_Tsai1" + ], + "rank": 741 + }, + { + "url": "https://openreview.net/forum?id=4RbdgBh9gE", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 5 + ], + "rating": "6.12", + "confidences": [ + 4, + 4, + 3, + 5 + ], + "abstract": "Effective training of deep neural networks can be challenging, and there remain many open questions on how to best learn these models. Recently developed methods to improve neural network training examine teaching: providing learned information during the training process to improve downstream model performance. In this paper, we take steps towards extending the scope of teaching. We propose a flexible teaching framework using commentaries, learned meta-information helpful for training on a particular task. We present gradient-based methods to learn commentaries, leveraging recent work on implicit differentiation for scalability. We explore diverse applications of commentaries, from weighting training examples, to parameterising label-dependent data augmentation policies, to representing attention masks that highlight salient image regions. We find that commentaries can improve training speed and/or performance, and provide insights about the dataset and training process. 
We also observe that commentaries generalise: they can be reused when training new models to obtain performance benefits, suggesting a use-case where commentaries are stored with a dataset and leveraged in future for improved model training. ", + "title": "Teaching with Commentaries", + "authors": [ + "Aniruddh Raghu", + "Maithra Raghu", + "Simon Kornblith", + "David Duvenaud", + "Geoffrey Hinton" + ], + "emails": [ + "~Geoffrey_Hinton1", + "~Maithra_Raghu1", + "~Simon_Kornblith1", + "~David_Duvenaud2", + "~Aniruddh_Raghu1" + ], + "rank": 742 + }, + { + "url": "https://openreview.net/forum?id=iQQK02mxVIT", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 5, + 7 + ], + "rating": "6.12", + "confidences": [ + 3, + 3, + 3, + 4, + 3 + ], + "abstract": "A data set sampled from a certain population is biased if the subgroups of the population are sampled at proportions that are significantly different from their underlying proportions. Training machine learning models on biased data sets requires correction techniques to compensate for the bias. We consider two commonly-used techniques, resampling and reweighting, that rebalance the proportions of the subgroups to maintain the desired objective function. Though statistically equivalent, it has been observed that resampling outperforms reweighting when combined with stochastic gradient algorithms. By analyzing illustrative examples, we explain the reason behind this phenomenon using tools from dynamical stability and stochastic asymptotics. We also present experiments from regression, classification, and off-policy prediction to demonstrate that this is a general phenomenon. 
We argue that it is imperative to consider the objective function design and the optimization algorithm together while addressing the sampling bias.\n", + "title": "Why resampling outperforms reweighting for correcting sampling bias with stochastic gradients", + "authors": [ + "Jing An", + "Lexing Ying", + "Yuhua Zhu" + ], + "emails": [ + "stanford.edu", + "~Lexing_Ying1" + ], + "rank": 743 + }, + { + "url": "https://openreview.net/forum?id=qrwe7XHTmYb", + "decision": "Accept (Poster)", + "ratings": [ + 9, + 7, + 5, + 4 + ], + "rating": "6.12", + "confidences": [ + 4, + 3, + 5, + 4 + ], + "abstract": "Neural network scaling has been critical for improving the model quality in many real-world machine learning applications with vast amounts of training data and compute. Although this trend of scaling is affirmed to be a sure-fire approach for better model quality, there are challenges on the path such as the computation cost,ease of programming, and efficient implementation on parallel devices. In this paper we demonstrate conditional computation as a remedy to the above mentioned impediments, and demonstrate its efficacy and utility. We make extensive use of GShard, a module composed of a set of lightweight annotation APIs and an extension to the XLA compiler to enable large scale models with up to trillions of parameters. GShard and conditional computation enable us to scale up multilingual neural machine translation Transformer model with Sparsely-Gated Mixture-of-Experts. 
We demonstrate that such a giant model with 600 billion parameters can efficiently be trained on 2048 TPU v3 cores in 4 days to achieve far superior quality for translation from 100 languages to English compared to the prior art.", + "title": "GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding", + "authors": [ + "Dmitry Lepikhin", + "HyoukJoong Lee", + "Yuanzhong Xu", + "Dehao Chen", + "Orhan Firat", + "Yanping Huang", + "Maxim Krikun", + "Noam Shazeer", + "Zhifeng Chen" + ], + "emails": [ + "~Orhan_Firat1", + "google.com", + "~Zhifeng_Chen1", + "~Yanping_Huang1", + "~Noam_Shazeer1" + ], + "rank": 744 + }, + { + "url": "https://openreview.net/forum?id=HCSgyPUfeDj", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 7, + 6 + ], + "rating": "6.12", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "We present a two-stage framework for deep one-class classification. We first learn self-supervised representations from one-class data, and then build one-class classifiers on learned representations. The framework not only allows to learn better representations, but also permits building one-class classifiers that are faithful to the target task. We argue that classifiers inspired by the statistical perspective in generative or discriminative models are more effective than existing approaches, such as a normality score from a surrogate classifier. We thoroughly evaluate different self-supervised representation learning algorithms under the proposed framework for one-class classification. Moreover, we present a novel distribution-augmented contrastive learning that extends training distributions via data augmentation to obstruct the uniformity of contrastive representations. In experiments, we demonstrate state-of-the-art performance on visual domain one-class classification benchmarks, including novelty and anomaly detection. 
Finally, we present visual explanations, confirming that the decision-making process of deep one-class classifiers is intuitive to humans. The code is available at https://github.com/google-research/deep_representation_one_class.\n", + "title": "Learning and Evaluating Representations for Deep One-Class Classification", + "authors": [ + "Kihyuk Sohn", + "Chun-Liang Li", + "Jinsung Yoon", + "Minho Jin", + "Tomas Pfister" + ], + "emails": [ + "~Kihyuk_Sohn1", + "~Jinsung_Yoon1", + "google.com", + "~Chun-Liang_Li1", + "~Tomas_Pfister1" + ], + "rank": 745 + }, + { + "url": "https://openreview.net/forum?id=V1ZHVxJ6dSS", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 4, + 8, + 7 + ], + "rating": "6.12", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "Large optimization problems with hard constraints arise in many settings, yet classical solvers are often prohibitively slow, motivating the use of deep networks as cheap \"approximate solvers.\" Unfortunately, naive deep learning approaches typically cannot enforce the hard constraints of such problems, leading to infeasible solutions. In this work, we present Deep Constraint Completion and Correction (DC3), an algorithm to address this challenge. Specifically, this method enforces feasibility via a differentiable procedure, which implicitly completes partial solutions to satisfy equality constraints and unrolls gradient-based corrections to satisfy inequality constraints. We demonstrate the effectiveness of DC3 in both synthetic optimization tasks and the real-world setting of AC optimal power flow, where hard constraints encode the physics of the electrical grid. In both cases, DC3 achieves near-optimal objective values while preserving feasibility.", + "title": "DC3: A learning method for optimization with hard constraints", + "authors": [ + "Priya L. 
Donti", + "David Rolnick", + "J Zico Kolter" + ], + "emails": [ + "~David_Rolnick1", + "~J_Zico_Kolter1", + "~Priya_L._Donti1" + ], + "rank": 746 + }, + { + "url": "https://openreview.net/forum?id=RDiiCiIH3_B", + "decision": "Reject", + "ratings": [ + 5, + 6, + 7 + ], + "rating": "6.11", + "confidences": [ + 3, + 2, + 4 + ], + "abstract": "Sketching is a compression technique that can be applied to many problems to solve them quickly and approximately. The matrices used to project data to smaller dimensions are called \"sketches\". In this work, we consider the problem of optimizing sketches to obtain low approximation error over a data distribution. \n\nWe introduce a general framework for \"learning\" and applying CountSketch, a type of sparse sketch. The sketch optimization procedure has two stages: one for optimizing the placements of the sketch's non-zero entries and another for optimizing their values. Next, we provide a way to apply learned sketches that has worst-case guarantees for approximation error. \n\nWe instantiate this framework with three sketching applications: least-squares regression, low-rank approximation (LRA), and k-means clustering. Our experiments demonstrate that our approach substantially decreases approximation error compared to classical and naively learned sketches. \n\nFinally, we investigate the theoretical aspects of our approach. For regression and LRA, we show that our method obtains state-of-the art accuracy for fixed time complexity. For LRA, we prove that it is strictly better to include the first optimization stage for two standard input distributions. 
For k-means, we derive a more straightforward means of retaining approximation guarantees.", + "title": "A framework for learned CountSketch", + "authors": [ + "Simin Liu", + "Tianrui Liu", + "Ali Vakilian", + "Yulin Wan", + "David Woodruff" + ], + "emails": [ + "~Tianrui_Liu3", + "~Simin_Liu1", + "~David_Woodruff1", + "~Ali_Vakilian1", + "~Yulin_Wan2" + ], + "rank": 747 + }, + { + "url": "https://openreview.net/forum?id=7t1FcJUWhi3", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 7 + ], + "rating": "6.11", + "confidences": [ + 4, + 2, + 3 + ], + "abstract": "Despite \u2014or maybe because of\u2014 their astonishing capacity to fit data, neural networks are believed to have difficulties extrapolating beyond training data distribution. This work shows that, for extrapolations based on finite transformation groups, a model\u2019s inability to extrapolate is unrelated to its capacity. Rather, the shortcoming is inherited from a learning hypothesis: Examples not explicitly observed with infinitely many training examples have underspecified outcomes in the learner\u2019s model. In order to endow neural networks with the ability to extrapolate over group transformations, we introduce a learning framework counterfactually-guided by the learning hypothesis that any group invariance to (known) transformation groups is mandatory even without evidence, unless the learner deems it inconsistent with the training data. Unlike existing invariance-driven methods for (counterfactual) extrapolations, this framework allows extrapolations from a single environment. 
Finally, we introduce sequence and image extrapolation tasks that validate our framework and showcase the shortcomings of traditional approaches.", + "title": "Neural Networks for Learning Counterfactual G-Invariances from Single Environments", + "authors": [ + "S Chandra Mouli", + "Bruno Ribeiro" + ], + "emails": [ + "~Bruno_Ribeiro1", + "~S_Chandra_Mouli1" + ], + "rank": 748 + }, + { + "url": "https://openreview.net/forum?id=qRdED5QjM9e", + "decision": "Reject", + "ratings": [ + 7, + 5, + 6 + ], + "rating": "6.10", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "We propose a distributionally robust learning (DRL) method for unsupervised domain adaptation (UDA) that scales to modern computer-vision benchmarks. DRL can be naturally formulated as a competitive two-player game between a predictor and an adversary that is allowed to corrupt the labels, subject to certain constraints, and reduces to incorporating a density ratio between the source and target domains (under the standard log loss). This formulation motivates the use of two neural networks that are jointly trained --- a discriminative network between the source and target domains for density-ratio estimation, in addition to the standard classification network. The use of a density ratio in DRL prevents the model from being overconfident on target inputs far away from the source domain. Thus, DRL provides conservative confidence estimation in the target domain, even when the target labels are not available. This conservatism motivates the use of DRL in self-training for sample selection, and we term the approach distributionally robust self-training (DRST). In our experiments, DRST generates more calibrated probabilities and achieves state-of-the-art self-training accuracy on benchmark datasets. We demonstrate that DRST captures shape features more effectively, and reduces the extent of distributional shift during self-training. 
", + "title": "Distributionally Robust Learning for Unsupervised Domain Adaptation", + "authors": [ + "Haoxuan Wang", + "Anqi Liu", + "Zhiding Yu", + "Yisong Yue", + "Anima Anandkumar" + ], + "emails": [ + "sjtu.edu.cn", + "~Yisong_Yue1", + "~Anima_Anandkumar1", + "~Zhiding_Yu1", + "~Anqi_Liu2" + ], + "rank": 749 + }, + { + "url": "https://openreview.net/forum?id=pULTvw9X313", + "decision": "Reject", + "ratings": [ + 4, + 6, + 9 + ], + "rating": "6.10", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "Deep learning based 3D shape generation methods generally utilize latent features extracted from color images to encode the objects' semantics and guide the shape generation process. These color image semantics only implicitly encode 3D information, potentially limiting the accuracy of the generated shapes. In this paper we propose a multi-view mesh generation method which incorporates geometry information in the color images explicitly by using the features from intermediate 2.5D depth representations of the input images and regularizing the 3D shapes against these depth images. Our system first predicts a coarse 3D volume from the color images by probabilistically merging voxel occupancy grids from individual views. Depth images corresponding to the multi-view color images are predicted which along with the rendered depth images of the coarse shape are used as a contrastive input whose features guide the refinement of the coarse shape through a series of graph convolution networks. 
Attention-based multi-view feature pooling is proposed to fuse the contrastive depth features from different viewpoints which are fed to the graph convolution networks.\nWe validate the proposed multi-view mesh generation method on ShapeNet, where we obtain a significant improvement with 34% decrease in chamfer distance to ground truth and 14% increase in the F1-score compared with the state-of-the-art multi-view shape generation method.", + "title": "MeshMVS: Multi-view Stereo Guided Mesh Reconstruction", + "authors": [ + "Rakesh Shrestha", + "Zhiwen Fan", + "Siyu Zhu", + "Zuozhuo Dai", + "Qingkun Su", + "Ping Tan" + ], + "emails": [ + "~Zuozhuo_Dai1", + "~Zhiwen_Fan1", + "~Qingkun_Su1", + "~Rakesh_Shrestha1", + "~Siyu_Zhu1", + "~Ping_Tan2" + ], + "rank": 750 + }, + { + "url": "https://openreview.net/forum?id=aAY23UgDBv0", + "decision": "Reject", + "ratings": [ + 7, + 7, + 4 + ], + "rating": "6.10", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "Deep probabilistic time series forecasting models have become an integral part of machine learning. While several powerful generative models have been proposed, we provide evidence that their associated inference models are oftentimes too limited and cause the generative model to predict mode-averaged dynamics. Mode-averaging is problematic since many real-world sequences are highly multi-modal, and their averaged dynamics are unphysical (e.g., predicted taxi trajectories might run through buildings on the street map). To better capture multi-modality, we develop variational dynamic mixtures (VDM): a new variational family to infer sequential latent variables. The VDM approximate posterior at each time step is a mixture density network, whose parameters come from propagating multiple samples through a recurrent architecture. This results in an expressive multi-modal posterior approximation. In an empirical study, we show that VDM outperforms competing approaches on highly multi-modal datasets from different domains. 
", + "title": "Variational Dynamic Mixtures", + "authors": [ + "Chen Qiu", + "Stephan Mandt", + "Maja Rudolph" + ], + "emails": [ + "~Maja_Rudolph3", + "~Chen_Qiu1", + "~Stephan_Mandt1" + ], + "rank": 751 + }, + { + "url": "https://openreview.net/forum?id=-M0QkvBGTTq", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 9, + 3, + 3 + ], + "rating": "6.10", + "confidences": [ + 4, + 4, + 5, + 4, + 3 + ], + "abstract": "Advanced data augmentation strategies have widely been studied to improve the generalization ability of deep learning models. Regional dropout is one of the popular solutions that guides the model to focus on less discriminative parts by randomly removing image regions, resulting in improved regularization. However, such information removal is undesirable. On the other hand, recent strategies suggest to randomly cut and mix patches and their labels among training images, to enjoy the advantages of regional dropout without having any pointless pixel in the augmented images. We argue that such random selection strategies of the patches may not necessarily represent sufficient information about the corresponding object and thereby mixing the labels according to that uninformative patch enables the model to learn unexpected feature representation. Therefore, we propose SaliencyMix that carefully selects a representative image patch with the help of a saliency map and mixes this indicative patch with the target image, thus leading the model to learn more appropriate feature representation. SaliencyMix achieves the best known top-1 error of 21.26% and 20.09% for ResNet-50 and ResNet-101 architectures on ImageNet classification, respectively, and also improves the model robustness against adversarial perturbations. Furthermore, models that are trained with SaliencyMix, help to improve the object detection performance. 
Source code is available at \\url{https://github.com/SaliencyMix/SaliencyMix}.", + "title": "SaliencyMix: A Saliency Guided Data Augmentation Strategy for Better Regularization", + "authors": [ + "A F M Shahab Uddin", + "Mst. Sirazam Monira", + "Wheemyung Shin", + "TaeChoong Chung", + "Sung-Ho Bae" + ], + "emails": [ + "~Sung-Ho_Bae1", + "khu.ac.kr", + "~A_F_M_Shahab_Uddin1", + "~Mst._Sirazam_Monira1" + ], + "rank": 752 + }, + { + "url": "https://openreview.net/forum?id=FX0vR39SJ5q", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 5 + ], + "rating": "6.10", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "Graphs are one of the most important data structures for representing pairwise relations between objects. Specifically, a graph embedded in a Euclidean space is essential to solving real problems, such as physical simulations. A crucial requirement for applying graphs in Euclidean spaces to physical simulations is learning and inferring the isometric transformation invariant and equivariant features in a computationally efficient manner. In this paper, we propose a set of transformation invariant and equivariant models based on graph convolutional networks, called IsoGCNs. We demonstrate that the proposed model has a competitive performance compared to state-of-the-art methods on tasks related to geometrical and physical simulation data. 
Moreover, the proposed model can scale up to graphs with 1M vertices and conduct an inference faster than a conventional finite element analysis, which the existing equivariant models cannot achieve.", + "title": "Isometric Transformation Invariant and Equivariant Graph Convolutional Networks", + "authors": [ + "Masanobu Horie", + "Naoki Morita", + "Toshiaki Hishinuma", + "Yu Ihara", + "Naoto Mitsume" + ], + "emails": [ + "~Masanobu_Horie1", + "kz.tsukuba.ac.jp", + "ricos.co.jp" + ], + "rank": 753 + }, + { + "url": "https://openreview.net/forum?id=J8_GttYLFgr", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 6, + 6 + ], + "rating": "6.09", + "confidences": [ + 2, + 3, + 2, + 4 + ], + "abstract": "Trajectory prediction is a critical part of many AI applications, for example, the safe operation of autonomous vehicles. However, current methods are prone to making inconsistent and physically unrealistic predictions. We leverage insights from fluid dynamics to overcome this limitation by considering internal symmetry in real-world trajectories. We propose a novel model, Equivariant Continous COnvolution (ECCO) for improved trajectory prediction. ECCO uses rotationally-equivariant continuous convolutions to embed the symmetries of the system. On both vehicle and pedestrian trajectory datasets, ECCO attains competitive accuracy with significantly fewer parameters. It is also more sample efficient, generalizing automatically from few data points in any orientation. Lastly, ECCO improves generalization with equivariance, resulting in more physically consistent predictions. Our method provides a fresh perspective towards increasing trust and transparency in deep learning models. 
Our code and data can be found at https://github.com/Rose-STL-Lab/ECCO.", + "title": "Trajectory Prediction using Equivariant Continuous Convolution", + "authors": [ + "Robin Walters", + "Jinxi Li", + "Rose Yu" + ], + "emails": [ + "~Rose_Yu1", + "northeastern.edu", + "~Robin_Walters1" + ], + "rank": 754 + }, + { + "url": "https://openreview.net/forum?id=lNrtNGkr-vw", + "decision": "Reject", + "ratings": [ + 7, + 6, + 5 + ], + "rating": "6.09", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "This paper introduces Fast Linearized Adaptive Policy (FLAP), a new meta-reinforcement learning (meta-RL) method that is able to extrapolate well to out-of-distribution tasks without the need to reuse data from training, and adapt almost instantaneously with the need of only a few samples during testing. FLAP builds upon the idea of learning a shared linear representation of the policy so that when adapting to a new task, it suffices to predict a set of linear weights. A separate adapter network is trained simultaneously with the policy such that during adaptation, we can directly use the adapter network to predict these linear weights instead of updating a meta-policy via gradient descent such as in prior Meta-RL algorithms like MAML to obtain the new policy. The application of the separate feed-forward network not only speeds up the adaptation run-time significantly, but also generalizes extremely well to very different tasks that prior Meta-RL methods fail to generalize to. 
Experiments on standard continuous-control meta-RL benchmarks show FLAP presenting significantly stronger performance on out-of-distribution tasks with up to double the average return and up to 8X faster adaptation run-time speeds when compared to prior methods.", + "title": "Linear Representation Meta-Reinforcement Learning for Instant Adaptation", + "authors": [ + "Matt Peng", + "Banghua Zhu", + "Jiantao Jiao" + ], + "emails": [ + "~Banghua_Zhu1", + "~Jiantao_Jiao1", + "~Matt_Peng1" + ], + "rank": 755 + }, + { + "url": "https://openreview.net/forum?id=jMc7DlflrMC", + "decision": "Reject", + "ratings": [ + 6, + 5, + 7, + 7 + ], + "rating": "6.08", + "confidences": [ + 4, + 4, + 3, + 2 + ], + "abstract": "Constrained reinforcement learning (CRL) plays an important role in solving safety-critical and resource-limited tasks. However, existing methods typically rely on tuning reward or cost parameters to encode the constraints, which can be tedious and tend to not generalize well. Instead of building sophisticated cost functions for constraints, we present a pioneering study of imposing constraints directly on the state density function of the system. Density functions have clear physical meanings and can express a variety of constraints in a straightforward fashion. We prove the duality between the density function and Q function in CRL and use it to develop an effective primal-dual algorithm to solve density constrained reinforcement learning problems. 
We provide theoretical guarantees of the optimality of our approach and use a comprehensive set of case studies including standard benchmarks to show that our method outperforms other leading CRL methods in terms of achieving higher reward while respecting the constraints.", + "title": "Density Constrained Reinforcement Learning", + "authors": [ + "Zengyi Qin", + "Yuxiao Chen", + "Chuchu Fan" + ], + "emails": [ + "~Zengyi_Qin1", + "~Chuchu_Fan2", + "~Yuxiao_Chen1" + ], + "rank": 756 + }, + { + "url": "https://openreview.net/forum?id=PrzjugOsDeE", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 5, + 6 + ], + "rating": "6.08", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "This work proposes the continuous conditional generative adversarial network (CcGAN), the first generative model for image generation conditional on continuous, scalar conditions (termed regression labels). Existing conditional GANs (cGANs) are mainly designed for categorical conditions (e.g., class labels); conditioning on a continuous label is mathematically distinct and raises two fundamental problems: (P1) Since there may be very few (even zero) real images for some regression labels, minimizing existing empirical versions of cGAN losses (a.k.a. empirical cGAN losses) often fails in practice; (P2) Since regression labels are scalar and infinitely many, conventional label input methods (e.g., combining a hidden map of the generator/discriminator with a one-hot encoded label) are not applicable. The proposed CcGAN solves the above problems, respectively, by (S1) reformulating existing empirical cGAN losses to be appropriate for the continuous scenario; and (S2) proposing a novel method to incorporate regression labels into the generator and the discriminator. 
The reformulation in (S1) leads to two novel empirical discriminator losses, termed the hard vicinal discriminator loss (HVDL) and the soft vicinal discriminator loss (SVDL) respectively, and a novel empirical generator loss. The error bounds of a discriminator trained with HVDL and SVDL are derived under mild assumptions in this work. A new benchmark dataset, RC-49, is also proposed for generative image modeling conditional on regression labels. Our experiments on the Circular 2-D Gaussians, RC-49, and UTKFace datasets show that CcGAN is able to generate diverse, high-quality samples from the image distribution conditional on a given regression label. Moreover, in these experiments, CcGAN substantially outperforms cGAN both visually and quantitatively.", + "title": "CcGAN: Continuous Conditional Generative Adversarial Networks for Image Generation", + "authors": [ + "Xin Ding", + "Yongwei Wang", + "Zuheng Xu", + "William J Welch", + "Z. Jane Wang" + ], + "emails": [ + "~Xin_Ding2", + "~Z._Jane_Wang1", + "~Zuheng_Xu1", + "~Yongwei_Wang1", + "~William_J_Welch1" + ], + "rank": 757 + }, + { + "url": "https://openreview.net/forum?id=r-gPPHEjpmw", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 7, + 4, + 4 + ], + "rating": "6.08", + "confidences": [ + 4, + 3, + 2, + 3 + ], + "abstract": "We propose a hierarchical reinforcement learning method, HIDIO, that can learn task-agnostic options in a self-supervised manner while jointly learning to utilize them to solve sparse-reward tasks. Unlike current hierarchical RL approaches that tend to formulate goal-reaching low-level tasks or pre-define ad hoc lower-level policies, HIDIO encourages lower-level option learning that is independent of the task at hand, requiring few assumptions or little knowledge about the task structure. These options are learned through an intrinsic entropy minimization objective conditioned on the option sub-trajectories. The learned options are diverse and task-agnostic. 
In experiments on sparse-reward robotic manipulation and navigation tasks, HIDIO achieves higher success rates with greater sample efficiency than regular RL baselines and two state-of-the-art hierarchical RL methods. Code at: https://github.com/jesbu1/hidio.", + "title": "Hierarchical Reinforcement Learning by Discovering Intrinsic Options", + "authors": [ + "Jesse Zhang", + "Haonan Yu", + "Wei Xu" + ], + "emails": [ + "~Jesse_Zhang3", + "~Haonan_Yu5", + "~Wei_Xu13" + ], + "rank": 758 + }, + { + "url": "https://openreview.net/forum?id=Mu2ZxFctAI", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 5 + ], + "rating": "6.08", + "confidences": [ + 4, + 3, + 4, + 2 + ], + "abstract": "For pool-based active learning, in each iteration a candidate training sample is chosen for labeling by optimizing an acquisition function. In Bayesian classification, expected Loss Reduction~(ELR) methods maximize the expected reduction in the classification error given a new labeled candidate based on a one-step-look-ahead strategy. ELR is the optimal strategy with a single query; however, since such myopic strategies cannot identify the long-term effect of a query on the classification error, ELR may get stuck before reaching the optimal classifier. In this paper, inspired by the mean objective cost of uncertainty (MOCU), a metric quantifying the uncertainty directly affecting the classification error, we propose an acquisition function based on a weighted form of MOCU. Similar to ELR, the proposed method focuses on the reduction of the uncertainty that pertains to the classification error. But unlike any other existing scheme, it provides the critical advantage that the resulting Bayesian active learning algorithm guarantees convergence to the optimal classifier of the true model. 
We demonstrate its performance with both synthetic and real-world datasets.", + "title": "Uncertainty-aware Active Learning for Optimal Bayesian Classifier", + "authors": [ + "Guang Zhao", + "Edward Dougherty", + "Byung-Jun Yoon", + "Francis Alexander", + "Xiaoning Qian" + ], + "emails": [ + "~Edward_Dougherty1", + "~Byung-Jun_Yoon1", + "~Guang_Zhao1", + "bnl.gov", + "~Xiaoning_Qian2" + ], + "rank": 759 + }, + { + "url": "https://openreview.net/forum?id=Ozk9MrX1hvA", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 5 + ], + "rating": "6.08", + "confidences": [ + 4, + 5, + 4 + ], + "abstract": "Data augmentation has been demonstrated as an effective strategy for improving model generalization and data efficiency. However, due to the discrete nature of natural language, designing label-preserving transformations for text data tends to be more challenging. In this paper, we propose a novel data augmentation frame-work dubbed CoDA, which synthesizes diverse and informative augmented examples by integrating multiple transformations organically. Moreover, a contrastive regularization is introduced to capture the global relationship among all the data samples. A momentum encoder along with a memory bank is further leveraged to better estimate the contrastive loss. To verify the effectiveness of the proposed framework, we apply CoDA to Transformer-based models on a wide range of natural language understanding tasks. On the GLUE benchmark, CoDA gives rise to an average improvement of 2.2%while applied to the Roberta-large model. More importantly, it consistently exhibits stronger results relative to several competitive data augmentation and adversarial training baselines (including the low-resource settings). 
Extensive experiments show that the proposed contrastive objective can be flexibly combined with various data augmentation approaches to further boost their performance, highlighting the wide applicability of the CoDA framework.", + "title": "CoDA: Contrast-enhanced and Diversity-promoting Data Augmentation for Natural Language Understanding", + "authors": [ + "Yanru Qu", + "Dinghan Shen", + "Yelong Shen", + "Sandra Sajeev", + "Weizhu Chen", + "Jiawei Han" + ], + "emails": [ + "~Yanru_Qu1", + "microsoft.com", + "~Yelong_Shen2", + "~Weizhu_Chen1", + "~Dinghan_Shen1", + "~Jiawei_Han1" + ], + "rank": 760 + }, + { + "url": "https://openreview.net/forum?id=IMPA6MndSXU", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 4, + 7 + ], + "rating": "6.08", + "confidences": [ + 3, + 4, + 4, + 2 + ], + "abstract": "While unsupervised domain translation (UDT) has seen a lot of success recently, we argue that mediating its translation via categorical semantic features could broaden its applicability. In particular, we demonstrate that categorical semantics improves the translation between perceptually different domains sharing multiple object categories. We propose a method to learn, in an unsupervised manner, categorical semantic features (such as object labels) that are invariant of the source and target domains. 
We show that conditioning the style encoder of unsupervised domain translation methods on the learned categorical semantics leads to a translation preserving the digits on MNIST\u2194SVHN and to a more realistic stylization on Sketches\u2192Reals.", + "title": "Integrating Categorical Semantics into Unsupervised Domain Translation", + "authors": [ + "Samuel Lavoie-Marchildon", + "Faruk Ahmed", + "Aaron Courville" + ], + "emails": [ + "~Faruk_Ahmed1", + "~Samuel_Lavoie-Marchildon1", + "~Aaron_Courville3" + ], + "rank": 761 + }, + { + "url": "https://openreview.net/forum?id=GVNGAaY2Dr1", + "decision": "Reject", + "ratings": [ + 6, + 7, + 6, + 5 + ], + "rating": "6.08", + "confidences": [ + 4, + 3, + 3, + 2 + ], + "abstract": "Recent advances in multi-agent reinforcement learning (MARL) have achieved super-human performance in games like Quake 3 and Dota 2. Unfortunately, these techniques require orders-of-magnitude more training rounds than humans and don't generalize to new agent configurations even on the same game. In this work, we propose Collaborative Q-learning (CollaQ) that achieves state-of-the-art performance in the StarCraft multi-agent challenge and supports ad hoc team play. We first formulate multi-agent collaboration as a joint optimization on reward assignment and show that each agent has an approximately optimal policy that decomposes into two parts: one part that only relies on the agent's own state, and the other part that is related to states of nearby agents. Following this novel finding, CollaQ decomposes the Q-function of each agent into a self term and an interactive term, with a Multi-Agent Reward Attribution (MARA) loss that regularizes the training. CollaQ is evaluated on various StarCraft maps and shows that it outperforms existing state-of-the-art techniques (i.e., QMIX, QTRAN, and VDN) by improving the win rate by 40% with the same number of samples. 
In the more challenging ad hoc team play setting (i.e., reweight/add/remove units without re-training or finetuning), CollaQ outperforms previous SoTA by over 30%. ", + "title": "Multi-Agent Collaboration via Reward Attribution Decomposition", + "authors": [ + "Tianjun Zhang", + "Huazhe Xu", + "Xiaolong Wang", + "Yi Wu", + "Kurt Keutzer", + "Joseph E. Gonzalez", + "Yuandong Tian" + ], + "emails": [ + "~Kurt_Keutzer1", + "~Huazhe_Xu1", + "~Joseph_E._Gonzalez1", + "~Xiaolong_Wang3", + "~Tianjun_Zhang1", + "~Yuandong_Tian1", + "~Yi_Wu1" + ], + "rank": 762 + }, + { + "url": "https://openreview.net/forum?id=Tt1s9Oi1kCS", + "decision": "Reject", + "ratings": [ + 3, + 7, + 8 + ], + "rating": "6.08", + "confidences": [ + 4, + 5, + 4 + ], + "abstract": "Attaining prototypical features to represent class distributions is well established in representation learning. However, learning prototypes online from streams of data proves a challenging endeavor as they rapidly become outdated, caused by an ever-changing parameter space in the learning process. Additionally, continual learning assumes a non-stationary nature of the data stream, typically resulting in catastrophic forgetting of previous knowledge. As a first, we introduce a system addressing both problems, where prototypes evolve continually in a shared latent space, enabling learning and prediction at any point in time. In contrast to the major body of work in continual learning, data streams are processed in an online fashion, without additional task-information, and an efficient memory scheme provides robustness to imbalanced data streams. Besides nearest neighbor based prediction, learning is facilitated by a novel objective function, encouraging cluster density about the class prototype and increased inter-class variance. Furthermore, the latent space quality is elevated by pseudo-prototypes in each batch, constituted by replay of exemplars from memory. 
We generalize the existing paradigms in continual learning to incorporate data incremental learning from data streams by formalizing a two-agent learner-evaluator framework, and obtain state-of-the-art performance by a significant margin on eight benchmarks, including three highly imbalanced data streams.", + "title": "Continual Prototype Evolution: Learning Online from Non-Stationary Data Streams", + "authors": [ + "Matthias De Lange", + "Tinne Tuytelaars" + ], + "emails": [ + "~Matthias_De_Lange1", + "~Tinne_Tuytelaars1" + ], + "rank": 763 + }, + { + "url": "https://openreview.net/forum?id=UoaQUQREMOs", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 5, + 7, + 7 + ], + "rating": "6.08", + "confidences": [ + 4, + 2, + 4, + 3 + ], + "abstract": "3D convolution is powerful for video classification but often computationally expensive, recent studies mainly focus on decomposing it on spatial-temporal and/or channel dimensions. Unfortunately, most approaches fail to achieve a preferable balance between convolutional efficiency and feature-interaction sufficiency. For this reason, we propose a concise and novel Channel Tensorization Network (CT-Net), by treating the channel dimension of input feature as a multiplication of K sub-dimensions. On one hand, it naturally factorizes convolution in a multiple dimension way, leading to a light computation burden. On the other hand, it can effectively enhance feature interaction from different channels, and progressively enlarge the 3D receptive field of such interaction to boost classification accuracy. Furthermore, we equip our CT-Module with a Tensor Excitation (TE) mechanism. It can learn to exploit spatial, temporal and channel attention in a high-dimensional manner, to improve the cooperative power of all the feature dimensions in our CT-Module. Finally, we flexibly adapt ResNet as our CT-Net. 
Extensive experiments are conducted on several challenging video benchmarks, e.g., Kinetics-400, Something-Something V1 and V2. Our CT-Net outperforms a number of recent SOTA approaches, in terms of accuracy and/or efficiency.", + "title": "CT-Net: Channel Tensorization Network for Video Classification", + "authors": [ + "Kunchang Li", + "Xianhang Li", + "Yali Wang", + "Jun Wang", + "Yu Qiao" + ], + "emails": [ + "~Kunchang_Li1", + "~Yali_Wang1", + "~Yu_Qiao1", + "~Xianhang_Li1", + "~Jun_Wang7" + ], + "rank": 764 + }, + { + "url": "https://openreview.net/forum?id=V5j-jdoDDP", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 7, + 5 + ], + "rating": "6.08", + "confidences": [ + 3, + 3, + 4, + 3 + ], + "abstract": "Symbolic techniques based on Satisfiability Modulo Theory (SMT) solvers have been proposed for analyzing and verifying neural network properties, but their usage has been fairly limited owing to their poor scalability with larger networks. In this work, we propose a technique for combining gradient-based methods with symbolic techniques to scale such analyses and demonstrate its application for model explanation. In particular, we apply this technique to identify minimal regions in an input that are most relevant for a neural network's prediction. Our approach uses gradient information (based on Integrated Gradients) to focus on a subset of neurons in the first layer, which allows our technique to scale to large networks. The corresponding SMT constraints encode the minimal input mask discovery problem such that after masking the input, the activations of the selected neurons are still above a threshold. After solving for the minimal masks, our approach scores the mask regions to generate a relative ordering of the features within the mask. This produces a saliency map which explains\" where a model is looking\" when making a prediction. 
We evaluate our technique on three datasets-MNIST, ImageNet, and Beer Reviews, and demonstrate both quantitatively and qualitatively that the regions generated by our approach are sparser and achieve higher saliency scores compared to the gradient-based methods alone. Code and examples are at - https://github.com/google-research/google-research/tree/master/smug_saliency", + "title": "Scaling Symbolic Methods using Gradients for Neural Model Explanation", + "authors": [ + "Subham Sekhar Sahoo", + "Subhashini Venugopalan", + "Li Li", + "Rishabh Singh", + "Patrick Riley" + ], + "emails": [ + "~Patrick_Riley2", + "~Subhashini_Venugopalan2", + "~Li_Li8", + "~Rishabh_Singh1", + "google.com" + ], + "rank": 765 + }, + { + "url": "https://openreview.net/forum?id=33rtZ4Sjwjn", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 8, + 5, + 6 + ], + "rating": "6.08", + "confidences": [ + 4, + 2, + 3, + 3 + ], + "abstract": "Standard Convolutional Neural Networks (CNNs) can be easily fooled by images with small quasi-imperceptible artificial perturbations. As alternatives to CNNs, the recently proposed Capsule Networks (CapsNets) are shown to be more robust to white-box attack than CNNs under popular attack protocols. Besides, the class-conditional reconstruction part of CapsNets is also used to detect adversarial examples. In this work, we investigate the adversarial robustness of CapsNets, especially how the inner workings of CapsNets change when the output capsules are attacked. The first observation is that adversarial examples misled CapsNets by manipulating the votes from primary capsules. Another observation is the high computational cost, when we directly apply multi-step attack methods designed for CNNs to attack CapsNets, due to the computationally expensive routing mechanism. Motivated by these two observations, we propose a novel vote attack where we attack votes of CapsNets directly. 
Our vote attack is not only effective, but also efficient by circumventing the routing process. Furthermore, we integrate our vote attack into the detection-aware attack paradigm, which can successfully bypass the class-conditional reconstruction based detection method. Extensive experiments demonstrate the superior attack performance of our vote attack on CapsNets.", + "title": "Effective and Efficient Vote Attack on Capsule Networks", + "authors": [ + "Jindong Gu", + "Baoyuan Wu", + "Volker Tresp" + ], + "emails": [ + "~Baoyuan_Wu1", + "~Volker_Tresp1", + "~Jindong_Gu1" + ], + "rank": 766 + }, + { + "url": "https://openreview.net/forum?id=MJAqnaC2vO1", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 5, + 7 + ], + "rating": "6.08", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Designing proper loss functions is essential in training deep networks. Especially in the field of semantic segmentation, various evaluation metrics have been proposed for diverse scenarios. Despite the success of the widely adopted cross-entropy loss and its variants, the mis-alignment between the loss functions and evaluation metrics degrades the network performance. Meanwhile, manually designing loss functions for each specific metric requires expertise and significant manpower. In this paper, we propose to automate the design of metric-specific loss functions by searching differentiable surrogate losses for each metric. We substitute the non-differentiable operations in the metrics with parameterized functions, and conduct parameter search to optimize the shape of loss surfaces. Two constraints are introduced to regularize the search space and make the search efficient. Extensive experiments on PASCAL VOC and Cityscapes demonstrate that the searched surrogate losses outperform the manually designed loss functions consistently. The searched losses can generalize well to other datasets and networks. 
Code shall be released at https://github.com/fundamentalvision/Auto-Seg-Loss.", + "title": "Auto Seg-Loss: Searching Metric Surrogates for Semantic Segmentation", + "authors": [ + "Hao Li", + "Chenxin Tao", + "Xizhou Zhu", + "Xiaogang Wang", + "Gao Huang", + "Jifeng Dai" + ], + "emails": [ + "~Jifeng_Dai1", + "~Xiaogang_Wang2", + "link.cuhk.edu.hk", + "~Xizhou_Zhu1", + "~Chenxin_Tao2", + "~Gao_Huang1" + ], + "rank": 767 + }, + { + "url": "https://openreview.net/forum?id=uFHwB6YTxXz", + "decision": "Reject", + "ratings": [ + 7, + 5, + 6, + 6 + ], + "rating": "6.08", + "confidences": [ + 3, + 2, + 3, + 4 + ], + "abstract": "Recent advances in deep learning from probability distributions successfully achieve classification or regression from distribution samples, thus invariant under permutation of the samples. The first contribution of the paper is to extend these neural architectures to achieve invariance under permutation of the features, too. The proposed architecture, called Dida, inherits the NN properties of universal approximation, and its robustness with respect to Lipschitz-bounded transformations of the input distribution is established. The second contribution is to empirically and comparatively demonstrate the merits of the approach on two tasks defined at the dataset level. On both tasks, Dida learns meta-features supporting the characterization of a (labelled) dataset. The first task consists of predicting whether two dataset patches are extracted from the same initial dataset. The second task consists of predicting whether the learning performance achieved by a hyper-parameter configuration under a fixed algorithm (ranging in k-NN, SVM, logistic regression and linear SGD) dominates that of another configuration, for a dataset extracted from the OpenML benchmarking suite. On both tasks, Dida outperforms the state of the art: DSS and Dataset2Vec architectures, as well as the models based on the hand-crafted meta-features of the literature. 
", + "title": "Distribution-Based Invariant Deep Networks for Learning Meta-Features", + "authors": [ + "Gwendoline de Bie", + "Herilalaina Rakotoarison", + "Gabriel Peyr\u00e9", + "Mich\u00e8le Sebag" + ], + "emails": [ + "~Herilalaina_Rakotoarison1", + "~Gwendoline_de_Bie1", + "~Gabriel_Peyr\u00e92", + "~Mich\u00e8le_Sebag1" + ], + "rank": 768 + }, + { + "url": "https://openreview.net/forum?id=9FWas6YbmB3", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 5 + ], + "rating": "6.08", + "confidences": [ + 2, + 4, + 3, + 3 + ], + "abstract": "This paper proposes a novel differentiable architecture search method by formulating it into a distribution learning problem. We treat the continuously relaxed architecture mixing weight as random variables, modeled by Dirichlet distribution. With recently developed pathwise derivatives, the Dirichlet parameters can be easily optimized with gradient-based optimizer in an end-to-end manner. This formulation improves the generalization ability and induces stochasticity that naturally encourages exploration in the search space. Furthermore, to alleviate the large memory consumption of differentiable NAS, we propose a simple yet effective progressive learning scheme that enables searching directly on large-scale tasks, eliminating the gap between search and evaluation phases. Extensive experiments demonstrate the effectiveness of our method. Specifically, we obtain a test error of 2.46\\% for CIFAR-10, 23.7\\% for ImageNet under the mobile setting. 
On NAS-Bench-201, we also achieve state-of-the-art results on all three datasets and provide insights for the effective design of neural architecture search algorithms.", + "title": "DrNAS: Dirichlet Neural Architecture Search", + "authors": [ + "Xiangning Chen", + "Ruochen Wang", + "Minhao Cheng", + "Xiaocheng Tang", + "Cho-Jui Hsieh" + ], + "emails": [ + "~Minhao_Cheng1", + "~Cho-Jui_Hsieh1", + "~Xiaocheng_Tang1", + "~Xiangning_Chen1", + "~Ruochen_Wang2" + ], + "rank": 769 + }, + { + "url": "https://openreview.net/forum?id=dgtpE6gKjHn", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 5, + 6 + ], + "rating": "6.08", + "confidences": [ + 4, + 3, + 2, + 4 + ], + "abstract": "Federated learning aims to collaboratively train a strong global model by accessing users' locally trained models but not their own data. A crucial step is therefore to aggregate local models into a global model, which has been shown challenging when users have non-i.i.d. data. In this paper, we propose a novel aggregation algorithm named FedBE, which takes a Bayesian inference perspective by sampling higher-quality global models and combining them via Bayesian model Ensemble, leading to much robust aggregation. We show that an effective model distribution can be constructed by simply fitting a Gaussian or Dirichlet distribution to the local models. Our empirical studies validate FedBE's superior performance, especially when users' data are not i.i.d. and when the neural networks go deeper. 
Moreover, FedBE is compatible with recent efforts in regularizing users' model training, making it an easily applicable module: you only need to replace the aggregation method but leave other parts of your federated learning algorithm intact.", + "title": "FedBE: Making Bayesian Model Ensemble Applicable to Federated Learning", + "authors": [ + "Hong-You Chen", + "Wei-Lun Chao" + ], + "emails": [ + "~Hong-You_Chen1", + "~Wei-Lun_Chao1" + ], + "rank": 770 + }, + { + "url": "https://openreview.net/forum?id=c3MWGN_cTf", + "decision": "Reject", + "ratings": [ + 5, + 8, + 5, + 6 + ], + "rating": "6.08", + "confidences": [ + 2, + 3, + 3, + 4 + ], + "abstract": "Fictitious Self-Play (FSP) has achieved significant empirical success in solving extensive-form games. \nHowever, from a theoretical perspective, it remains unknown whether FSP is guaranteed to converge to Nash equilibria in Markov games.\nAs an initial attempt, we propose an FSP algorithm for two-player zero-sum Markov games, dubbed as smooth FSP, where both agents adopt an entropy-regularized policy optimization method against each other. \nSmooth FSP builds upon a connection between smooth fictitious play and the policy optimization framework. Specifically, in each iteration, each player infers the policy of the opponent implicitly via policy evaluation and improves its current policy by taking the smoothed best-response via a proximal policy optimization (PPO) step. \nMoreover, to tame the non-stationarity caused by the opponent, we propose to incorporate entropy regularization in PPO for algorithmic stability. \nWhen both players adopt smooth FSP simultaneously, i.e., with self-play, we prove that the sequence of joint policies converges to a neighborhood of a Nash equilibrium at a sublinear O~(1/T) rate, where T is the number of iterations. 
To our best knowledge, we establish the first finite-time convergence guarantee for FSP-type algorithms in zero-sum Markov games.", + "title": "Policy Optimization in Zero-Sum Markov Games: Fictitious Self-Play Provably Attains Nash Equilibria", + "authors": [ + "Boyi Liu", + "Zhuoran Yang", + "Zhaoran Wang" + ], + "emails": [ + "~Zhaoran_Wang1", + "~Zhuoran_Yang1", + "~Boyi_Liu1" + ], + "rank": 771 + }, + { + "url": "https://openreview.net/forum?id=LNtTXJ9XXr", + "decision": "Reject", + "ratings": [ + 7, + 7, + 6, + 5 + ], + "rating": "6.07", + "confidences": [ + 4, + 2, + 4, + 5 + ], + "abstract": "Adversarial training is a commonly used technique to improve model robustness against adversarial examples. Despite its success as a defense mechanism, adversarial training often fails to generalize well to unperturbed test data. While previous work assumes it is caused by the discrepancy between robust and non-robust features, in this paper, we introduce \\emph{Adversarial Masking}, a new hypothesis that this trade-off is caused by different feature maskings applied. Specifically, the rescaling operation in the batch normalization layer, when combined together with ReLU activation, serves as a feature masking layer to select different features for model training. By carefully manipulating different maskings, a well-balanced trade-off can be achieved between model performance on unperturbed and perturbed data. Built upon this hypothesis, we further propose Robust Masking (RobMask), which constructs unique masking for every specific attack perturbation by learning a set of primary adversarial feature maskings. By incorporating different feature maps after the masking, we can distill better features to help model generalization. Sufficiently, adversarial training can be treated as an effective regularizer to achieve better generalization. 
Experiments on multiple benchmarks demonstrate that RobMask achieves significant improvement on clean test accuracy compared to strong state-of-the-art baselines.", + "title": "Adversarial Masking: Towards Understanding Robustness Trade-off for Generalization", + "authors": [ + "Minhao Cheng", + "Zhe Gan", + "Yu Cheng", + "Shuohang Wang", + "Cho-Jui Hsieh", + "Jingjing Liu" + ], + "emails": [ + "~Minhao_Cheng1", + "~Cho-Jui_Hsieh1", + "~Jingjing_Liu2", + "~Shuohang_Wang1", + "~Yu_Cheng1", + "~Zhe_Gan1" + ], + "rank": 772 + }, + { + "url": "https://openreview.net/forum?id=-Lr-u0b42he", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 6, + 6 + ], + "rating": "6.07", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "We present neural architectures that disentangle RGB-D images into objects\u2019 shapes and styles and a map of the background scene, and explore their applications for few-shot 3D object detection and few-shot concept classification. Our networks incorporate architectural biases that reflect the image formation process, 3D geometry of the world scene, and shape-style interplay. They are trained end-to-end self-supervised by predicting views in static scenes, alongside a small number of 3D object boxes. Objects and scenes are represented in terms of 3D feature grids in the bottleneck of the network. We show the proposed 3D neural representations are compositional: they can generate novel 3D scene feature maps by mixing object shapes and styles, resizing and adding the resulting object 3D feature maps over background scene feature maps. We show object detectors trained on hallucinated 3D neural scenes generalize better to novel environments. 
We show classifiers for object categories, color, materials, and spatial relationships trained over the disentangled 3D feature sub-spaces generalize better with dramatically fewer exemplars over the current state-of-the-art, and enable a visual question answering system that uses them as its modules to generalize one-shot to novel objects in the scene.", + "title": "Disentangling 3D Prototypical Networks for Few-Shot Concept Learning", + "authors": [ + "Mihir Prabhudesai", + "Shamit Lal", + "Darshan Patil", + "Hsiao-Yu Tung", + "Adam W Harley", + "Katerina Fragkiadaki" + ], + "emails": [ + "~Katerina_Fragkiadaki1", + "~Hsiao-Yu_Tung1", + "~Darshan_Patil1", + "~Shamit_Lal1", + "~Adam_W_Harley1", + "~Mihir_Prabhudesai1" + ], + "rank": 773 + }, + { + "url": "https://openreview.net/forum?id=1dm_j4ciZp", + "decision": "Significant concerns (Do not publish)", + "ratings": [ + 5, + 6, + 7, + 6 + ], + "rating": "6.07", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Many optimizers have been proposed for training deep neural networks, and they often have multiple hyperparameters, which make it tricky to benchmark their performance. In this work, we propose a new benchmarking protocol to evaluate both end-to-end efficiency (training a model from scratch without knowing the best hyperparameter) and data-addition training efficiency (the previously selected hyperparameters are used for periodically re-training the model with newly collected data). For end-to-end efficiency, unlike previous work that assumes random hyperparameter tuning, which over-emphasizes the tuning time, we propose to evaluate with a bandit hyperparameter tuning strategy. A human study is conducted to show our evaluation protocol matches human tuning behavior better than the random search. For data-addition training, we propose a new protocol for assessing the hyperparameter sensitivity to data shift. 
We then apply the proposed benchmarking framework to 7 optimizers and various tasks, including computer vision, natural language processing, reinforcement learning, and graph mining. Our results show that there is no clear winner across all the tasks. \n", + "title": "How much progress have we made in neural network training? A New Evaluation Protocol for Benchmarking Optimizers", + "authors": [ + "Yuanhao Xiong", + "Xuanqing Liu", + "Li-Cheng Lan", + "Yang You", + "Si Si", + "Cho-Jui Hsieh" + ], + "emails": [ + "~Cho-Jui_Hsieh1", + "~Yuanhao_Xiong1", + "~Li-Cheng_Lan1", + "~Yang_You1", + "~Si_Si1", + "~Xuanqing_Liu1" + ], + "rank": 774 + }, + { + "url": "https://openreview.net/forum?id=RqCC_00Bg7V", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 6, + 6 + ], + "rating": "6.07", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Model-Predictive Control (MPC) is a powerful tool for controlling complex, real-world systems that uses a model to make predictions about future behavior. For each state encountered, MPC solves an online optimization problem to choose a control action that will minimize future cost. This is a surprisingly effective strategy, but real-time performance requirements warrant the use of simple models. If the model is not sufficiently accurate, then the resulting controller can be biased, limiting performance. We present a framework for improving on MPC with model-free reinforcement learning (RL). The key insight is to view MPC as constructing a series of local Q-function approximations. We show that by using a parameter \u03bb, similar to the trace decay parameter in TD(\u03bb), we can systematically trade-off learned value estimates against the local Q-function approximations. We present a theoretical analysis that shows how error from inaccurate models in MPC and value function estimation in RL can be balanced. 
We further propose an algorithm that changes \u03bb over time to reduce the dependence on MPC as our estimates of the value function improve, and test the efficacy our approach on challenging high-dimensional manipulation tasks with biased models in simulation. We demonstrate that our approach can obtain performance comparable with MPC with access to true dynamics even under severe model bias and is more sample efficient as compared to model-free RL.", + "title": "Blending MPC & Value Function Approximation for Efficient Reinforcement Learning", + "authors": [ + "Mohak Bhardwaj", + "Sanjiban Choudhury", + "Byron Boots" + ], + "emails": [ + "~Mohak_Bhardwaj1", + "gmail.com", + "~Byron_Boots1" + ], + "rank": 775 + }, + { + "url": "https://openreview.net/forum?id=YmA86Zo-P_t", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 7, + 7, + 6 + ], + "rating": "6.07", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Sequence-to-sequence (seq2seq) learners are widely used, but we still have only limited knowledge about what inductive biases shape the way they generalize. We address that by investigating how popular seq2seq learners generalize in tasks that have high ambiguity in the training data. We use four new tasks to study learners' preferences for memorization, arithmetic, hierarchical, and compositional reasoning. Further, we connect to Solomonoff's theory of induction and propose to use description length as a principled and sensitive measure of inductive biases. In our experimental study, we find that LSTM-based learners can learn to perform counting, addition, and multiplication by a constant from a single training example. Furthermore, Transformer and LSTM-based learners show a bias toward the hierarchical induction over the linear one, while CNN-based learners prefer the opposite. The latter also show a bias toward a compositional generalization over memorization. 
Finally, across all our experiments, description length proved to be a sensitive measure of inductive biases.", + "title": "What they do when in doubt: a study of inductive biases in seq2seq learners", + "authors": [ + "Eugene Kharitonov", + "Rahma Chaabouni" + ], + "emails": [ + "~Eugene_Kharitonov1", + "~Rahma_Chaabouni1" + ], + "rank": 776 + }, + { + "url": "https://openreview.net/forum?id=MmcywoW7PbJ", + "decision": "Reject", + "ratings": [ + 5, + 6, + 7, + 6 + ], + "rating": "6.07", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "It is of significance for an agent to learn a widely applicable and general-purpose policy that can achieve diverse goals including images and text descriptions. Considering such perceptually-specific goals, the frontier of deep reinforcement learning research is to learn a goal-conditioned policy without hand-crafted rewards. To learn this kind of policy, recent works usually take as the reward the non-parametric distance to a given goal in an explicit embedding space. From a different viewpoint, we propose a novel unsupervised learning approach named goal-conditioned policy with intrinsic motivation (GPIM), which jointly learns both an abstract-level policy and a goal-conditioned policy. The abstract-level policy is conditioned on a latent variable to optimize a discriminator and discovers diverse states that are further rendered into perceptually-specific goals for the goal-conditioned policy. The learned discriminator serves as an intrinsic reward function for the goal-conditioned policy to imitate the trajectory induced by the abstract-level policy. Experiments on various robotic tasks demonstrate the effectiveness and efficiency of our proposed GPIM method which substantially outperforms prior techniques. 
", + "title": "Learn Goal-Conditioned Policy with Intrinsic Motivation for Deep Reinforcement Learning", + "authors": [ + "Jinxin Liu", + "Donglin Wang", + "Qiangxing Tian", + "Zhengyu Chen" + ], + "emails": [ + "~Donglin_Wang1", + "~Jinxin_Liu1", + "~Qiangxing_Tian1", + "~Zhengyu_Chen2" + ], + "rank": 777 + }, + { + "url": "https://openreview.net/forum?id=ESG-DMKQKsD", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 6, + 6 + ], + "rating": "6.07", + "confidences": [ + 4, + 3, + 5, + 3 + ], + "abstract": "We propose a novel task of joint few-shot recognition and novel-view synthesis: given only one or few images of a novel object from arbitrary views with only category annotation, we aim to simultaneously learn an object classifier and generate images of that type of object from new viewpoints. While existing work copes with two or more tasks mainly by multi-task learning of shareable feature representations, we take a different perspective. We focus on the interaction and cooperation between a generative model and a discriminative model, in a way that facilitates knowledge to flow across tasks in complementary directions. To this end, we propose bowtie networks that jointly learn 3D geometric and semantic representations with a feedback loop. Experimental evaluation on challenging fine-grained recognition datasets demonstrates that our synthesized images are realistic from multiple viewpoints and significantly improve recognition performance as ways of data augmentation, especially in the low-data regime. 
", + "title": "Bowtie Networks: Generative Modeling for Joint Few-Shot Recognition and Novel-View Synthesis", + "authors": [ + "Zhipeng Bao", + "Yu-Xiong Wang", + "Martial Hebert" + ], + "emails": [ + "~Martial_Hebert1", + "~Zhipeng_Bao1", + "~Yu-Xiong_Wang1" + ], + "rank": 778 + }, + { + "url": "https://openreview.net/forum?id=vnlqCDH1b6n", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 8 + ], + "rating": "6.07", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Disentangled representation learning has undoubtedly benefited from objective function surgery. However, a delicate balancing act of tuning is still required in order to trade off reconstruction fidelity versus disentanglement. Building on previous successes of penalizing the total correlation in the latent variables, we propose TCWAE (Total Correlation Wasserstein Autoencoder). Working in the WAE paradigm naturally enables the separation of the total-correlation term, thus providing disentanglement control over the learned representation, while offering more flexibility in the choice of reconstruction cost. We propose two variants using different KL estimators and perform extensive quantitative comparisons on data sets with known generative factors, showing competitive results relative to state-of-the-art techniques. 
We further study the trade off between disentanglement and reconstruction on more-difficult data sets with unknown generative factors, where we expect improved reconstructions due to the flexibility of the WAE paradigm.", + "title": "Learning disentangled representations with the Wasserstein Autoencoder", + "authors": [ + "Benoit Gaujac", + "Ilya Feige", + "David Barber" + ], + "emails": [ + "~Ilya_Feige1", + "~Benoit_Gaujac1", + "~David_Barber2" + ], + "rank": 779 + }, + { + "url": "https://openreview.net/forum?id=gIHd-5X324", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 4, + 7, + 6 + ], + "rating": "6.07", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Knowledge distillation is an effective approach to leverage a well-trained network or an ensemble of them, named as the teacher, to guide the training of a student network. The outputs from the teacher network are used as soft labels for supervising the training of a new network. Recent studies (M \u0308uller et al., 2019; Yuan et al., 2020) revealed an intriguing property of the soft labels that making labels soft serves as a good regularization to the student network. From the perspective of statistical learning, regularization aims to reduce the variance, however how bias and variance change is not clear for training with soft labels. In this paper, we investigate the bias-variance tradeoff brought by distillation with soft labels. Specifically, we observe that during training the bias-variance tradeoff varies sample-wisely. Further, under the same distillation temperature setting, we observe that the distillation performance is negatively associated with the number of some specific samples, which are named as regularization samples since these samples lead to bias increasing and variance decreasing. Nevertheless, we empirically find that completely filtering out regularization samples also deteriorates distillation performance. 
Our discoveries inspired us to propose the novel weighted soft labels to help the network adaptively handle the sample-wise bias-variance tradeoff. Experiments on standard evaluation benchmarks validate the effectiveness of our method. Our code is available in the supplementary.", + "title": "Rethinking Soft Labels for Knowledge Distillation: A Bias\u2013Variance Tradeoff Perspective", + "authors": [ + "Helong Zhou", + "Liangchen Song", + "Jiajie Chen", + "Ye Zhou", + "Guoli Wang", + "Junsong Yuan", + "Qian Zhang" + ], + "emails": [ + "~Helong_Zhou1", + "~Liangchen_Song1", + "~Jiajie_Chen1", + "~Junsong_Yuan2", + "~Qian_Zhang7", + "~Ye_Zhou2", + "~Guoli_Wang2" + ], + "rank": 780 + }, + { + "url": "https://openreview.net/forum?id=7IDIy7Jb00l", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 7 + ], + "rating": "6.07", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Consider the following problem: given the complete training histories of N conventional RL agents, trained on N different tasks, design a meta-agent that can quickly maximize reward in a new, unseen task from the same task distribution. In particular, while each conventional RL agent explored and exploited its own different task, the meta-agent must identify regularities in the data that lead to effective exploration/exploitation in the unseen task. This meta-learning problem is an instance of a setting we term Offline Meta Reinforcement Learning (OMRL). To solve our challenge, we take a Bayesian RL (BRL) view, and seek to learn a Bayes-optimal policy from the offline data. We extend the recently proposed VariBAD BRL algorithm to the off-policy setting, and demonstrate learning of approximately Bayes-optimal exploration strategies from offline data using deep neural networks. For the particular problem described above, our method learns effective exploration behavior that is qualitatively different from the exploration used by any RL agent in the data. 
Furthermore, we find that when applied to the online meta-RL setting (agent simultaneously collects data and improves its meta-RL policy), our method is significantly more sample efficient than the state-of-the-art VariBAD.", + "title": "Offline Meta Learning of Exploration", + "authors": [ + "Ron Dorfman", + "Aviv Tamar" + ], + "emails": [ + "~Aviv_Tamar2", + "~Ron_Dorfman2" + ], + "rank": 781 + }, + { + "url": "https://openreview.net/forum?id=FsLTUzZlsgT", + "decision": "Reject", + "ratings": [ + 4, + 7, + 7, + 6 + ], + "rating": "6.07", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "A learning curve models a classifier\u2019s test error as a function of the number of training samples. Prior works show that learning curves can be used to select model parameters and extrapolate performance. We investigate how to use learning curves to analyze the impact of design choices, such as pre-training, architecture, and data augmentation. We propose a method to robustly estimate learning curves, abstract their parameters into error and data-reliance, and evaluate the effectiveness of different parameterizations. We also provide several interesting observations based on learning curves for a variety of image classification models.", + "title": "Learning Curves for Analysis of Deep Networks", + "authors": [ + "Derek Hoiem", + "Tanmay Gupta", + "Zhizhong Li", + "Michal M Shlapentokh-Rothman" + ], + "emails": [ + "~Derek_Hoiem1", + "~Tanmay_Gupta1", + "~Michal_M_Shlapentokh-Rothman1", + "~Zhizhong_Li1" + ], + "rank": 782 + }, + { + "url": "https://openreview.net/forum?id=mRNkPVHyIVX", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6, + 7 + ], + "rating": "6.07", + "confidences": [ + 4, + 2, + 5, + 3 + ], + "abstract": "Recent advances on adversarial defense mainly focus on improving the classifier\u2019s robustness against adversarially perturbed inputs. 
In this paper, we turn our attention from classifiers to inputs and explore if there exist safe spots in the vicinity of natural images that are robust to adversarial attacks. In this regard, we introduce a novel bi-level optimization algorithm that can find safe spots on over 90% of the correctly classified images for adversarially trained classifiers on CIFAR-10 and ImageNet datasets. Our experiments also show that they can be used to improve both the empirical and certified robustness on smoothed classifiers. Furthermore, by exploiting a novel safe spot inducing model training scheme and our safe spot generation method, we propose a new out-of-distribution detection algorithm which achieves the state of the art results on near-distribution outliers.", + "title": "Exploiting Safe Spots in Neural Networks for Preemptive Robustness and Out-of-Distribution Detection", + "authors": [ + "Seungyong Moon", + "Gaon An", + "Hyun Oh Song" + ], + "emails": [ + "~Seungyong_Moon1", + "~Hyun_Oh_Song1", + "~Gaon_An1" + ], + "rank": 783 + }, + { + "url": "https://openreview.net/forum?id=9p2ekP904Rs", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 6, + 6 + ], + "rating": "6.07", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Self-supervised learning has emerged as a strategy to reduce the reliance on costly supervised signal by pretraining representations only using unlabeled data. These methods combine heuristic proxy classification tasks with data augmentations and have achieved significant success, but our theoretical understanding of this success remains limited. In this paper we analyze self-supervised representation learning using a causal framework. We show how data augmentations can be more effectively utilized through explicit invariance constraints on the proxy classifiers employed during pretraining. 
Based on this, we propose a novel self-supervised objective, Representation Learning via Invariant Causal Mechanisms (ReLIC), that enforces invariant prediction of proxy targets across augmentations through an invariance regularizer which yields improved generalization guarantees. Further, using causality we generalize contrastive learning, a particular kind of self-supervised method, and provide an alternative theoretical explanation for the success of these methods. Empirically, ReLIC significantly outperforms competing methods in terms of robustness and out-of-distribution generalization on ImageNet, while also significantly outperforming these methods on Atari achieving above human-level performance on 51 out of 57 games.", + "title": "Representation Learning via Invariant Causal Mechanisms", + "authors": [ + "Jovana Mitrovic", + "Brian McWilliams", + "Jacob C Walker", + "Lars Holger Buesing", + "Charles Blundell" + ], + "emails": [ + "~Jacob_C_Walker1", + "~Lars_Holger_Buesing1", + "~Brian_McWilliams2", + "~Charles_Blundell1", + "~Jovana_Mitrovic1" + ], + "rank": 784 + }, + { + "url": "https://openreview.net/forum?id=JFKR3WqwyXR", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 4, + 6 + ], + "rating": "6.07", + "confidences": [ + 2, + 5, + 3, + 4 + ], + "abstract": "Combinations of neural ODEs with recurrent neural networks (RNN), like GRU-ODE-Bayes or ODE-RNN are well suited to model irregularly observed time series. While those models outperform existing discrete-time approaches, no theoretical guarantees for their predictive capabilities are available. Assuming that the irregularly-sampled time series data originates from a continuous stochastic process, the L2-optimal online prediction is the conditional expectation given the currently available information. We introduce the Neural Jump ODE (NJ-ODE) that provides a data-driven approach to learn, continuously in time, the conditional expectation of a stochastic process. 
Our approach models the conditional expectation between two observations with a neural ODE and jumps whenever a new observation is made. We define a novel training framework, which allows us to prove theoretical guarantees for the first time. In particular, we show that the output of our model converges to the L2-optimal prediction. This can be interpreted as solution to a special filtering problem. We provide experiments showing that the theoretical results also hold empirically. Moreover, we experimentally show that our model outperforms the baselines in more complex learning tasks and give comparisons on real-world datasets.", + "title": "Neural Jump Ordinary Differential Equations: Consistent Continuous-Time Prediction and Filtering", + "authors": [ + "Calypso Herrera", + "Florian Krach", + "Josef Teichmann" + ], + "emails": [ + "~Calypso_Herrera1", + "~Florian_Krach1", + "math.ethz.ch" + ], + "rank": 785 + }, + { + "url": "https://openreview.net/forum?id=qClL9hRDSMZ", + "decision": "Reject", + "ratings": [ + 5, + 7, + 7, + 6, + 5 + ], + "rating": "6.07", + "confidences": [ + 3, + 4, + 3, + 1, + 3 + ], + "abstract": "We investigate gradient descent training of wide neural networks and the corresponding implicit bias in function space. For 1D regression, we show that the solution of training a width-n shallow ReLU network is within n\u22121/2 of the function which fits the training data and whose difference from initialization has smallest 2-norm of the second derivative weighted by 1/\u03b6. The curvature penalty function 1/\u03b6 is expressed in terms of the probability distribution that is utilized to initialize the network parameters, and we compute it explicitly for various common initialization procedures. For instance, asymmetric initialization with a uniform distribution yields a constant curvature penalty, and thence the solution function is the natural cubic spline interpolation of the training data. 
While similar results have been obtained in previous works, our analysis clarifies important details and allows us to obtain significant generalizations. In particular, the result generalizes to multivariate regression and different activation functions. Moreover, we show that the training trajectories are captured by trajectories of spatially adaptive smoothing splines with decreasing regularization strength. ", + "title": "Implicit bias of gradient descent for mean squared error regression with wide neural networks", + "authors": [ + "Hui Jin", + "Guido Montufar" + ], + "emails": [ + "~Hui_Jin1", + "~Guido_Montufar1" + ], + "rank": 786 + }, + { + "url": "https://openreview.net/forum?id=DGIXvEAJVd", + "decision": "Reject", + "ratings": [ + 7, + 5, + 5, + 7 + ], + "rating": "6.07", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Transformer language models have made tremendous strides in natural language understanding. However, the complexity of natural language makes it challenging to ascertain how accurately these models are tracking the world state underlying the text. Motivated by this issue, we consider the task of language modeling for the game of chess. Unlike natural language, chess notations describe a simple, constrained, and deterministic domain. Moreover, we observe that chess notation itself allows for directly probing the world state, without requiring any additional probing-related machinery. Additionally, we have access to a vast number of chess games coupled with the exact state at every move, allowing us to measure the impact of various ways of including grounding during language model training. Overall, we find that with enough training data, transformer language models can learn to track pieces and predict legal moves when trained solely from move sequences. 
However, in adverse circumstances (small training sets or prediction following long move histories), providing access to board state information during training can yield consistent improvements.", + "title": "Learning Chess Blindfolded", + "authors": [ + "Shubham Toshniwal", + "Sam Wiseman", + "Karen Livescu", + "Kevin Gimpel" + ], + "emails": [ + "~Kevin_Gimpel1", + "~Karen_Livescu1", + "~Shubham_Toshniwal1", + "~Sam_Wiseman1" + ], + "rank": 787 + }, + { + "url": "https://openreview.net/forum?id=9GBZBPn0Jx", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 5, + 8 + ], + "rating": "6.07", + "confidences": [ + 4, + 3, + 5, + 3 + ], + "abstract": "Motion forecasting is essential for making intelligent decisions in robotic navigation. As a result, the multi-agent behavioral prediction has become a core component of modern human-robot interaction applications such as autonomous driving. Due to various intentions and interactions among agents, agent trajectories can have multiple possible futures. Hence, the motion forecasting model's ability to cover possible modes becomes essential to enable accurate prediction. Towards this goal, we introduce HalentNet to better model the future motion distribution in addition to a traditional trajectory regression learning objective by incorporating generative augmentation losses. We model intents with unsupervised discrete random variables whose training is guided by a collaboration between two key signals: A discriminative loss that encourages intents' diversity and a hallucinative loss that explores intent transitions (i.e., mixed intents) and encourages their smoothness. This regulates the neural network behavior to be more accurately predictive on uncertain scenarios due to the active yet careful exploration of possible future agent behavior. Our model's learned representation leads to better and more semantically meaningful coverage of the trajectory distribution. 
Our experiments show that our method can improve over the state-of-the-art trajectory forecasting benchmarks, including vehicles and pedestrians, for about 20% on average FDE and 50% on road boundary violation rate when predicting 6 seconds future. We also conducted human experiments to show that our predicted trajectories received 39.6% more votes than the runner-up approach and 32.2% more votes than our variant without hallucinative mixed intent loss. The code will be released soon. ", + "title": "HalentNet: Multimodal Trajectory Forecasting with Hallucinative Intents", + "authors": [ + "Deyao Zhu", + "Mohamed Zahran", + "Li Erran Li", + "Mohamed Elhoseiny" + ], + "emails": [ + "~Mohamed_Zahran1", + "~Mohamed_Elhoseiny1", + "~Li_Erran_Li1", + "~Deyao_Zhu1" + ], + "rank": 788 + }, + { + "url": "https://openreview.net/forum?id=mj7WsaHYxj", + "decision": "Reject", + "ratings": [ + 6, + 7, + 5, + 6 + ], + "rating": "6.07", + "confidences": [ + 3, + 5, + 4, + 3 + ], + "abstract": "Data augmentation helps neural networks generalize better, but it remains an open question how to effectively augment graph data to enhance the performance of GNNs (Graph Neural Networks). While most existing graph regularizers focus on augmenting graph topological structures by adding/removing edges, we offer a novel direction to augment in the input node feature space for better performance. We propose a simple but effective solution, FLAG (Free Large-scale Adversarial Augmentation on Graphs), which iteratively augments node features with gradient-based adversarial perturbations during training, and boosts performance at test time. Empirically, FLAG can be easily implemented with a dozen lines of code and is flexible enough to function with any GNN backbone, on a wide variety of large-scale datasets, and in both transductive and inductive settings. 
Without modifying a model's architecture or training setup, FLAG yields a consistent and salient performance boost across both node and graph classification tasks. Using FLAG, we reach state-of-the-art performance on the large-scale ogbg-molpcba, ogbg-ppa, and ogbg-code datasets. ", + "title": "FLAG: Adversarial Data Augmentation for Graph Neural Networks", + "authors": [ + "Kezhi Kong", + "Guohao Li", + "Mucong Ding", + "Zuxuan Wu", + "Chen Zhu", + "Bernard Ghanem", + "Gavin Taylor", + "Tom Goldstein" + ], + "emails": [ + "~Mucong_Ding1", + "~Kezhi_Kong1", + "~Chen_Zhu2", + "~Zuxuan_Wu1", + "~Guohao_Li1", + "~Bernard_Ghanem1", + "~Tom_Goldstein1", + "~Gavin_Taylor1" + ], + "rank": 789 + }, + { + "url": "https://openreview.net/forum?id=D9pSaTGUemb", + "decision": "Reject", + "ratings": [ + 6, + 5, + 7, + 6 + ], + "rating": "6.06", + "confidences": [ + 4, + 4, + 5, + 3 + ], + "abstract": "We study the implicit acceleration of gradient flow in over-parameterized two-layer linear models. We show that implicit acceleration emerges from a conservation law that constrains the dynamics to follow certain trajectories. More precisely, gradient flow preserves the difference of the Gramian~matrices of the input and output weights and we show that the amount of acceleration depends on both the magnitude of that difference (which is fixed at initialization) and the spectrum of the data. 
In addition, and generalizing prior work, we prove our results without assuming small, balanced or spectral initialization for the weights, and establish interesting connections between the matrix factorization problem and Riccati type differential equations.", + "title": "Implicit Acceleration of Gradient Flow in Overparameterized Linear Models", + "authors": [ + "Salma Tarmoun", + "Guilherme Fran\u00e7a", + "Benjamin David Haeffele", + "Rene Vidal" + ], + "emails": [ + "~Rene_Vidal1", + "~Salma_Tarmoun1", + "~Benjamin_David_Haeffele1", + "~Guilherme_Fran\u00e7a1" + ], + "rank": 790 + }, + { + "url": "https://openreview.net/forum?id=5jzlpHvvRk", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 6, + 7 + ], + "rating": "6.06", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "Designing proper loss functions for vision tasks has been a long-standing research direction to advance the capability of existing models. For object detection, the well-established classification and regression loss functions have been carefully designed by considering diverse learning challenges (e.g. class imbalance, hard negative samples, and scale variances). Inspired by the recent progress in network architecture search, it is interesting to explore the possibility of discovering new loss function formulations via directly searching the primitive operation combinations. So that the learned losses not only fit for diverse object detection challenges to alleviate huge human efforts, but also have better alignment with evaluation metric and good mathematical convergence property. Beyond the previous auto-loss works on face recognition and image classification, our work makes the first attempt to discover new loss functions for the challenging object detection from primitive operation levels and finds the searched losses are insightful. 
We propose an effective convergence-simulation driven evolutionary search algorithm, called CSE-Autoloss, for speeding up the search progress by regularizing the mathematical rationality of loss candidates via two progressive convergence simulation modules: convergence property verification and model optimization simulation. CSE-Autoloss involves the search space (i.e. 21 mathematical operators, 3 constant-type inputs, and 3 variable-type inputs) that cover a wide range of the possible variants of existing losses and discovers best-searched loss function combination within a short time (around 1.5 wall-clock days with 20x speedup in comparison to the vanilla evolutionary algorithm). We conduct extensive evaluations of loss function search on popular detectors and validate the good generalization capability of searched losses across diverse architectures and various datasets. Our experiments show that the best-discovered loss function combinations outperform default combinations (Cross-entropy/Focal loss for classification and L1 loss for regression) by 1.1% and 0.8% in terms of mAP for two-stage and one-stage detectors on COCO respectively. Our searched losses are available at https://github.com/PerdonLiu/CSE-Autoloss.", + "title": "Loss Function Discovery for Object Detection via Convergence-Simulation Driven Search", + "authors": [ + "Peidong Liu", + "Gengwei Zhang", + "Bochao Wang", + "Hang Xu", + "Xiaodan Liang", + "Yong Jiang", + "Zhenguo Li" + ], + "emails": [ + "~Peidong_Liu2", + "~Bochao_Wang2", + "~Gengwei_Zhang1", + "~Yong_Jiang3", + "~Hang_Xu1", + "~Xiaodan_Liang2", + "~Zhenguo_Li1" + ], + "rank": 791 + }, + { + "url": "https://openreview.net/forum?id=Wi5KUNlqWty", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 8, + 5, + 7 + ], + "rating": "6.06", + "confidences": [ + 4, + 5, + 4, + 3 + ], + "abstract": "Attention mechanism in graph neural networks is designed to assign larger weights to important neighbor nodes for better representation. 
However, what graph attention learns is not understood well, particularly when graphs are noisy. In this paper, we propose a self-supervised graph attention network (SuperGAT), an improved graph attention model for noisy graphs. Specifically, we exploit two attention forms compatible with a self-supervised task to predict edges, whose presence and absence contain the inherent information about the importance of the relationships between nodes. By encoding edges, SuperGAT learns more expressive attention in distinguishing mislinked neighbors. We find two graph characteristics influence the effectiveness of attention forms and self-supervision: homophily and average degree. Thus, our recipe provides guidance on which attention design to use when those two graph characteristics are known. Our experiment on 17 real-world datasets demonstrates that our recipe generalizes across 15 datasets of them, and our models designed by recipe show improved performance over baselines.", + "title": "How to Find Your Friendly Neighborhood: Graph Attention Design with Self-Supervision", + "authors": [ + "Dongkwan Kim", + "Alice Oh" + ], + "emails": [ + "~Alice_Oh1", + "~Dongkwan_Kim1" + ], + "rank": 792 + }, + { + "url": "https://openreview.net/forum?id=SPhswbiXpJQ", + "decision": "Reject", + "ratings": [ + 7, + 7, + 4, + 6, + 7 + ], + "rating": "6.06", + "confidences": [ + 3, + 3, + 4, + 4, + 3 + ], + "abstract": "Compiler architects increasingly look to machine learning when building heuristics for compiler optimization. The promise of automatic heuristic design, freeing the compiler engineer from the complex interactions of program, architecture, and other optimizations, is alluring. However, most machine learning methods cannot replicate even the simplest of the abstract interpretations of data flow analysis that are critical to making good optimization decisions. 
This must change for machine learning to become the dominant technology in compiler heuristics.\n\nTo this end, we propose ProGraML - Program Graphs for Machine Learning - a language-independent, portable representation of whole-program semantics for deep learning. To benchmark current and future learning techniques for compiler analyses we introduce an open dataset of 461k Intermediate Representation (IR) files for LLVM, covering five source programming languages, and 15.4M corresponding data flow results. We formulate data flow analysis as an MPNN and show that, using ProGraML, standard analyses can be learned, yielding improved performance on downstream compiler optimization tasks.", + "title": "Deep Data Flow Analysis", + "authors": [ + "Chris Cummins", + "Zacharias Fisches", + "Tal Ben-Nun", + "Torsten Hoefler", + "Hugh Leather", + "Michael O'Boyle" + ], + "emails": [ + "inf.ethz.ch", + "~Tal_Ben-Nun1", + "student.ethz.ch", + "~Michael_O'Boyle1", + "fb.com" + ], + "rank": 793 + }, + { + "url": "https://openreview.net/forum?id=Ma0S4RcfpR_", + "decision": "Reject", + "ratings": [ + 6, + 5, + 8, + 5 + ], + "rating": "6.06", + "confidences": [ + 4, + 4, + 5, + 5 + ], + "abstract": "The grid cells in the mammalian medial entorhinal cortex exhibit striking hexagon firing patterns when the agent navigates in the open field. It is hypothesized that the grid cells are involved in path integration so that the agent is aware of its self-position by accumulating its self-motion. Assuming the grid cells form a vector representation of self-position, we elucidate a minimally simple recurrent model for grid cells' path integration based on two coupled matrix Lie algebras that underlie two coupled rotation systems that mirror the agent's self-motion: (1) When the agent moves along a certain direction, the vector is rotated by a generator matrix. (2) When the agent changes direction, the generator matrix is rotated by another generator matrix. 
Our experiments show that our model learns hexagonal grid response patterns that resemble the firing patterns observed from the grid cells in the brain. Furthermore, the learned model is capable of near exact path integration, and it is also capable of error correction. Our model is novel and simple, with explicit geometric and algebraic structures. ", + "title": "A Representational Model of Grid Cells' Path Integration Based on Matrix Lie Algebras", + "authors": [ + "Ruiqi Gao", + "Jianwen Xie", + "Xue-Xin Wei", + "Song-Chun Zhu", + "Ying Nian Wu" + ], + "emails": [ + "~Xue-Xin_Wei2", + "~Jianwen_Xie1", + "~Song-Chun_Zhu1", + "~Ruiqi_Gao2", + "~Ying_Nian_Wu1" + ], + "rank": 794 + }, + { + "url": "https://openreview.net/forum?id=T6AxtOaWydQ", + "decision": "Accept (Poster)", + "ratings": [ + 3, + 7, + 7, + 7 + ], + "rating": "6.06", + "confidences": [ + 4, + 3, + 5, + 5 + ], + "abstract": "Contrastive representation learning has shown to be effective to learn representations from unlabeled data. However, much progress has been made in vision domains relying on data augmentations carefully designed using domain knowledge. In this work, we propose i-Mix, a simple yet effective domain-agnostic regularization strategy for improving contrastive representation learning. We cast contrastive learning as training a non-parametric classifier by assigning a unique virtual class to each data in a batch. Then, data instances are mixed in both the input and virtual label spaces, providing more augmented data during training. In experiments, we demonstrate that i-Mix consistently improves the quality of learned representations across domains, including image, speech, and tabular data. Furthermore, we confirm its regularization effect via extensive ablation studies across model and dataset sizes. 
The code is available at https://github.com/kibok90/imix.", + "title": "i-Mix: A Domain-Agnostic Strategy for Contrastive Representation Learning", + "authors": [ + "Kibok Lee", + "Yian Zhu", + "Kihyuk Sohn", + "Chun-Liang Li", + "Jinwoo Shin", + "Honglak Lee" + ], + "emails": [ + "~Kihyuk_Sohn1", + "~Kibok_Lee1", + "~Jinwoo_Shin1", + "~Chun-Liang_Li1", + "~Honglak_Lee2", + "umich.edu" + ], + "rank": 795 + }, + { + "url": "https://openreview.net/forum?id=-mWcQVLPSPy", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 4 + ], + "rating": "6.06", + "confidences": [ + 5, + 4, + 3, + 4 + ], + "abstract": "Zero-shot learning (ZSL) aims to classify images of an unseen class only based on a few attributes describing that class but no access to any training sample. A popular strategy is to learn a mapping between the semantic space of class attributes and the visual space of images based on the seen classes and their data. Thus, an unseen class image can be ideally mapped to its corresponding class attributes. The key challenge is how to align the representations in the two spaces. For most ZSL settings, the attributes for each seen/unseen class are only represented by a vector while the seen-class data provide much more information. Thus, the imbalanced supervision from the semantic and the visual space can make the learned mapping easily overfitting to the seen classes. To resolve this problem, we propose Isometric Propagation Network (IPN), which learns to strengthen the relation between classes within each space and align the class dependency in the two spaces. Specifically, IPN learns to propagate the class representations on an auto-generated graph within each space. In contrast to only aligning the resulted static representation, we regularize the two dynamic propagation procedures to be isometric in terms of the two graphs' edge weights per step by minimizing a consistency loss between them. 
IPN achieves state-of-the-art performance on three popular ZSL benchmarks. To evaluate the generalization capability of IPN, we further build two larger benchmarks with more diverse unseen classes and demonstrate the advantages of IPN on them.", + "title": "Isometric Propagation Network for Generalized Zero-shot Learning", + "authors": [ + "Lu Liu", + "Tianyi Zhou", + "Guodong Long", + "Jing Jiang", + "Xuanyi Dong", + "Chengqi Zhang" + ], + "emails": [ + "~Xuanyi_Dong1", + "~Guodong_Long2", + "~Tianyi_Zhou1", + "~Lu_Liu7", + "~Chengqi_Zhang1", + "~Jing_Jiang6" + ], + "rank": 796 + }, + { + "url": "https://openreview.net/forum?id=wQRlSUZ5V7B", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 5, + 6 + ], + "rating": "6.06", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "We present a principled approach to incorporating labels in variational autoencoders (VAEs) that captures the rich characteristic information associated with those labels. While prior work has typically conflated these by learning latent variables that directly correspond to label values, we argue this is contrary to the intended effect of supervision in VAEs\u2014capturing rich label characteristics with the latents. For example, we may want to capture the characteristics of a face that make it look young, rather than just the age of the person. To this end, we develop a novel VAE model, the characteristic capturing VAE (CCVAE), which \u201creparameterizes\u201d supervision through auxiliary variables and a concomitant variational objective. Through judicious structuring of mappings between latent and auxiliary variables, we show that the CCVAE can effectively learn meaningful representations of the characteristics of interest across a variety of supervision schemes. 
In particular, we show that the CCVAE allows for more effective and more general interventions to be performed, such as smooth traversals within the characteristics for a given label, diverse conditional generation, and transferring characteristics across datapoints.", + "title": "Capturing Label Characteristics in VAEs", + "authors": [ + "Tom Joy", + "Sebastian Schmon", + "Philip Torr", + "Siddharth N", + "Tom Rainforth" + ], + "emails": [ + "~Siddharth_N1", + "~Tom_Joy1", + "gmail.com", + "~Tom_Rainforth1", + "~Philip_Torr1" + ], + "rank": 797 + }, + { + "url": "https://openreview.net/forum?id=JHcqXGaqiGn", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 4, + 6, + 7 + ], + "rating": "6.06", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "Graph neural networks have been widely used on modeling graph data, achieving impressive results on node classification and link prediction tasks. Yet, obtaining an accurate representation for a graph further requires a pooling function that maps a set of node representations into a compact form. A simple sum or average over all node representations considers all node features equally without consideration of their task relevance, and any structural dependencies among them. Recently proposed hierarchical graph pooling methods, on the other hand, may yield the same representation for two different graphs that are distinguished by the Weisfeiler-Lehman test, as they suboptimally preserve information from the node features. To tackle these limitations of existing graph pooling methods, we first formulate the graph pooling problem as a multiset encoding problem with auxiliary information about the graph structure, and propose a Graph Multiset Transformer (GMT) which is a multi-head attention based global pooling layer that captures the interaction between nodes according to their structural dependencies. 
We show that GMT satisfies both injectiveness and permutation invariance, such that it is at most as powerful as the Weisfeiler-Lehman graph isomorphism test. Moreover, our methods can be easily extended to the previous node clustering approaches for hierarchical graph pooling. Our experimental results show that GMT significantly outperforms state-of-the-art graph pooling methods on graph classification benchmarks with high memory and time efficiency, and obtains even larger performance gain on graph reconstruction and generation tasks.", + "title": "Accurate Learning of Graph Representations with Graph Multiset Pooling", + "authors": [ + "Jinheon Baek", + "Minki Kang", + "Sung Ju Hwang" + ], + "emails": [ + "~Sung_Ju_Hwang1", + "~Minki_Kang1", + "~Jinheon_Baek1" + ], + "rank": 798 + }, + { + "url": "https://openreview.net/forum?id=FlhlcARywRz", + "decision": "Reject", + "ratings": [ + 6, + 7, + 4, + 7 + ], + "rating": "6.06", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "How do we build a general and broad object detection system? We use all labels of all concepts ever annotated. These labels span many diverse datasets with potentially inconsistent semantic labels. In this paper, we show how to integrate these datasets and their semantic taxonomies in a completely automated fashion. Once integrated, we train an off-the-shelf object detector on the union of the datasets. This unified recognition system performs as well as dataset-specific models on each training domain, but generalizes much better to new unseen domains. 
Entries based on the presented methodology ranked first in the object detection and instance segmentation tracks of the ECCV 2020 Robust Vision Challenge.", + "title": "Learning a unified label space", + "authors": [ + "Xingyi Zhou", + "Vladlen Koltun", + "Philipp Kraehenbuehl" + ], + "emails": [ + "~Vladlen_Koltun1", + "~Xingyi_Zhou2", + "~Philipp_Kraehenbuehl1" + ], + "rank": 799 + }, + { + "url": "https://openreview.net/forum?id=u8X280hw1Mt", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 8 + ], + "rating": "6.06", + "confidences": [ + 4, + 4, + 5, + 5 + ], + "abstract": "In this paper, we propose a method, named EqCo (Equivalent Rules for Contrastive Learning), to make self-supervised learning irrelevant to the number of negative samples in the contrastive learning framework. Inspired by the InfoMax principle, we point that the margin term in contrastive loss needs to be adaptively scaled according to the number of negative pairs in order to keep steady mutual information bound and gradient magnitude. EqCo bridges the performance gap among a wide range of negative sample sizes, so that for the first time, we can use only a few negative pairs (e.g. 16 per query) to perform self-supervised contrastive training on large-scale vision datasets like ImageNet, while with almost no accuracy drop. This is quite a contrast to the widely used large batch training or memory bank mechanism in current practices. 
Equipped with EqCo, our simplified MoCo (SiMo) achieves comparable accuracy with MoCo v2 on ImageNet (linear evaluation protocol) while only involves 16 negative pairs per query instead of 65536, suggesting that large quantities of negative samples might not be a critical factor in contrastive learning frameworks.", + "title": "EqCo: Equivalent Rules for Self-supervised Contrastive Learning", + "authors": [ + "Benjin Zhu", + "Junqiang Huang", + "Zeming Li", + "Xiangyu Zhang", + "Jian Sun" + ], + "emails": [ + "~Xiangyu_Zhang1", + "~Jian_Sun4", + "~Benjin_Zhu1", + "~Zeming_Li2", + "~Junqiang_Huang1" + ], + "rank": 800 + }, + { + "url": "https://openreview.net/forum?id=VD_ozqvBy4W", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 6, + 7, + 8 + ], + "rating": "6.06", + "confidences": [ + 5, + 4, + 3, + 4 + ], + "abstract": "Pretrained Transformer-based language models (LMs) display remarkable natural language generation capabilities. With their immense potential, controlling text generation of such LMs is getting attention. While there are studies that seek to control high-level attributes (such as sentiment and topic) of generated text, there is still a lack of more precise control over its content at the word- and phrase-level. Here, we propose Content-Conditioner (CoCon) to control an LM's output text with a content input, at a fine-grained level. In our self-supervised approach, the CoCon block learns to help the LM complete a partially-observed text sequence by conditioning with content inputs that are withheld from the LM. 
Through experiments, we show that CoCon can naturally incorporate target content into generated texts and control high-level text attributes in a zero-shot manner.", + "title": "CoCon: A Self-Supervised Approach for Controlled Text Generation", + "authors": [ + "Alvin Chan", + "Yew-Soon Ong", + "Bill Pung", + "Aston Zhang", + "Jie Fu" + ], + "emails": [ + "~Aston_Zhang2", + "e.ntu.edu.sg", + "~Yew-Soon_Ong1", + "~Jie_Fu2", + "~Alvin_Chan1" + ], + "rank": 801 + }, + { + "url": "https://openreview.net/forum?id=bIrL42I_NF8", + "decision": "Reject", + "ratings": [ + 4, + 7, + 6, + 7 + ], + "rating": "6.06", + "confidences": [ + 4, + 4, + 5, + 5 + ], + "abstract": "Decentralized training of deep learning models enables on-device learning over networks, as well as efficient scaling to large compute clusters. Experiments in earlier works revealed that decentralized training often suffers from generalization issues: the performance of models trained in a decentralized fashion is in general worse than the performance of models trained in a centralized fashion, and this generalization gap is impacted by parameters such as network size, communication topology, and data partitioning.\n\nWe identify the changing consensus distance between devices as a key parameter to explain the gap between centralized and decentralized training. We show that when the consensus distance does not grow too large, the performance of centralized training can be reached and sometimes surpassed. We highlight the intimate interplay between network topology and learning rate at the different training phases and discuss the implications for communication efficient training schemes. 
Our insights into the generalization gap in decentralized deep learning allow the principled design of better training schemes that mitigate these effects.\n ", + "title": "On the Effect of Consensus in Decentralized Deep Learning", + "authors": [ + "Tao Lin", + "Lingjing Kong", + "Anastasia Koloskova", + "Martin Jaggi", + "Sebastian U Stich" + ], + "emails": [ + "~Sebastian_U_Stich1", + "~Martin_Jaggi1", + "~Anastasia_Koloskova2", + "epfl.ch", + "~Tao_Lin1" + ], + "rank": 802 + }, + { + "url": "https://openreview.net/forum?id=oyZxhRI2RiE", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 6, + 7, + 7, + 7 + ], + "rating": "6.05", + "confidences": [ + 5, + 4, + 4, + 3, + 4 + ], + "abstract": "Conversational Semantic Parsing (CSP) is the task of converting a sequence of natural language queries to formal language (e.g., SQL, SPARQL) that can be executed against a structured ontology (e.g. databases, knowledge bases). To accomplish this task, a CSP system needs to model the relation between the unstructured language utterance and the structured ontology while representing the multi-turn dynamics of the dialog. Pre-trained language models (LMs) are the state-of-the-art for various natural language processing tasks. However, existing pre-trained LMs that use language modeling training objectives over free-form text have limited ability to represent natural language references to contextual structural data. In this work, we present SCORE, a new pre-training approach for CSP tasks designed to induce representations that capture the alignment between the dialogue flow and the structural context. We demonstrate the broad applicability of SCORE to CSP tasks by combining SCORE with strong base systems on four different tasks (SPARC, COSQL, MWOZ, and SQA). 
We show that SCORE can improve the performance over all these base systems by a significant margin and achieves state-of-the-art results on three of them.", + "title": "SCoRe: Pre-Training for Context Representation in Conversational Semantic Parsing", + "authors": [ + "Tao Yu", + "Rui Zhang", + "Alex Polozov", + "Christopher Meek", + "Ahmed Hassan Awadallah" + ], + "emails": [ + "~Rui_Zhang7", + "~Ahmed_Hassan_Awadallah1", + "~Christopher_Meek1", + "~Tao_Yu5", + "~Alex_Polozov1" + ], + "rank": 803 + }, + { + "url": "https://openreview.net/forum?id=YCXrx6rRCXO", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 6, + 7, + 6 + ], + "rating": "6.05", + "confidences": [ + 5, + 4, + 5, + 2, + 3 + ], + "abstract": "We propose a fast, distance-preserving, binary embedding algorithm to transform a high-dimensional dataset T\u2286Rn into binary sequences in the cube {\u00b11}m. When T consists of well-spread (i.e., non-sparse) vectors, our embedding method applies a stable noise-shaping quantization scheme to Ax where A\u2208Rm\u00d7n is a sparse Gaussian random matrix. This contrasts with most binary embedding methods, which usually use x\u21a6sign(Ax) for the embedding. Moreover, we show that Euclidean distances among the elements of T are approximated by the \u21131 norm on the images of {\u00b11}m under a fast linear transformation. This again contrasts with standard methods, where the Hamming distance is used instead. Our method is both fast and memory efficient, with time complexity O(m) and space complexity O(m) on well-spread data. When the data is not well-spread, we show that the approach still works provided that data is transformed via a Walsh-Hadamard matrix, but now the cost is O(nlog\u2061n) per data point. 
Further, we prove that the method is accurate and its associated error is comparable to that of a continuous valued Johnson-Lindenstrauss embedding plus a quantization error that admits a polynomial decay as the embedding dimension m increases.\n\tThus the length of the binary codes required to achieve a desired accuracy is quite small, and we show it can even be compressed further without compromising the accuracy. To illustrate our results, we test the proposed method on natural images and show that it achieves strong performance.", + "title": "Faster Binary Embeddings for Preserving Euclidean Distances", + "authors": [ + "Jinjie Zhang", + "Rayan Saab" + ], + "emails": [ + "~Jinjie_Zhang1", + "~Rayan_Saab1" + ], + "rank": 804 + }, + { + "url": "https://openreview.net/forum?id=Y-Wl1l0Va-", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 3, + 5, + 3 + ], + "abstract": "We propose the k-Shortest-Path (k-SP) constraint: a novel constraint on the agent\u2019s trajectory that improves the sample-efficiency in sparse-reward MDPs. We show that any optimal policy necessarily satisfies the k-SP constraint. Notably, the k-SP constraint prevents the policy from exploring state-action pairs along the non-k-SP trajectories (e.g., going back and forth). However, in practice, excluding state-action pairs may hinder convergence of many RL algorithms. To overcome this, we propose a novel cost function that penalizes the policy violating SP constraint, instead of completely excluding it. Our numerical experiment in a tabular RL setting demonstrate that the SP constraint can significantly reduce the trajectory space of policy. As a result, our constraint enables more sample efficient learning by suppressing redundant exploration and exploitation. 
Our empirical experiment results on MiniGrid and DeepMind Lab show that the proposed method significantly improves proximal policy optimization (PPO) and outperforms existing novelty-seeking exploration methods including count-based exploration, indicating that it improves the sample efficiency by preventing the agent from taking redundant actions.", + "title": "Shortest-Path Constrained Reinforcement Learning for Sparse Reward Tasks", + "authors": [ + "Sungryull Sohn", + "Sungtae Lee", + "Jongwook Choi", + "Harm van Seijen", + "Honglak Lee", + "Mehdi Fatemi" + ], + "emails": [ + "~Mehdi_Fatemi1", + "~Sungryull_Sohn1", + "~Jongwook_Choi1", + "~Sungtae_Lee1", + "~Harm_van_Seijen1", + "~Honglak_Lee2" + ], + "rank": 805 + }, + { + "url": "https://openreview.net/forum?id=DlPnp5_1JMI", + "decision": "Reject", + "ratings": [ + 6, + 6, + 7, + 5 + ], + "rating": "6.00", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Neural ordinary differential equations (neural ODEs) introduced an approach to approximate a neural network as a system of ODEs after considering its layer as a continuous variable and discretizing its hidden dimension. While having several good characteristics, neural ODEs are known to be numerically unstable and slow in solving their integral problems, resulting in errors and/or much computation of the forward-pass inference. In this work, we present a novel partial differential equation (PDE)-based approach that removes the necessity of solving integral problems and considers both the layer and the hidden dimension as continuous variables. Owing to the recent advancement of learning PDEs, the presented novel concept, called PR-Net, can be implemented. Our method shows comparable (or better) accuracy and robustness in much shorter forward-pass inference time for various datasets and tasks in comparison with neural ODEs and Isometric MobileNet V3. 
For the efficient nature of PR-Net, it is suitable to be deployed in resource-scarce environments, e.g., deploying instead of MobileNet.", + "title": "PDE-regularized Neural Networks for Image Classification", + "authors": [ + "Jungeun Kim", + "Seunghyun Hwang", + "Jihyun Hwang", + "Kookjin Lee", + "Dongeun Lee", + "Noseong Park" + ], + "emails": [ + "gmail.com", + "sandia.gov", + "yonsei.ac.kr", + "tamuc.edu", + "~Noseong_Park1" + ], + "rank": 806 + }, + { + "url": "https://openreview.net/forum?id=6VhmvP7XZue", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Supervised and semi-supervised learning methods have been traditionally designed for the closed-world setting which is based on the assumption that unlabeled test data contains only classes previously encountered in the labeled training data. However, the real world is often open and dynamic, and thus novel previously unseen classes may appear in the test data or during the model deployment. Here, we introduce a new open-world semi-supervised learning setting in which the model is required to recognize previously seen classes, as well as to discover novel classes never seen in the labeled dataset. To tackle the problem, we propose ORCA, an approach that jointly learns a feature representation and a classifier on the labeled and unlabeled subsets of the data. The key idea in ORCA is in introducing uncertainty based adaptive margin that effectively circumvents the bias caused by the imbalance of variance between seen and novel classes. We demonstrate that ORCA accurately discovers novel classes and assigns samples to previously seen classes on standard benchmark image classification datasets, including CIFAR and ImageNet. 
Remarkably, despite solving the harder task ORCA outperforms semi-supervised methods on seen classes, as well as novel class discovery methods on unseen classes, achieving 7% and 151% improvements on seen and unseen classes of the ImageNet dataset.", + "title": "Open-world Semi-supervised Learning", + "authors": [ + "Kaidi Cao", + "Maria Brbic", + "Jure Leskovec" + ], + "emails": [ + "cs.stanford.edu", + "~Jure_Leskovec1", + "~Kaidi_Cao1" + ], + "rank": 807 + }, + { + "url": "https://openreview.net/forum?id=luGQiBeRMxd", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 3, + 4, + 2, + 5 + ], + "abstract": "We present a new method for score-based adversarial attack, where the attacker queries the loss-oracle of the target model.\nOur method employs a parameterized search space with a structure that captures the relationship of the gradient of the loss function. We show that searching over the structured space can be approximated by a time-varying contextual bandits problem, where the attacker takes feature of the associated arm to make modifications of the input, and receives an immediate reward as the reduction of the loss function. The time-varying contextual bandits problem can then be solved by a Bayesian optimization procedure, which can take advantage of the features of the structured action space. 
The experiments on ImageNet and the Google Cloud Vision API demonstrate that the proposed method achieves the state of the art success rates and query efficiencies for both undefended and defended models.", + "title": "CorrAttack: Black-box Adversarial Attack with Structured Search", + "authors": [ + "Zhichao Huang", + "Yaowei Huang", + "Tong Zhang" + ], + "emails": [ + "~Tong_Zhang2", + "~Yaowei_Huang2", + "~Zhichao_Huang1" + ], + "rank": 808 + }, + { + "url": "https://openreview.net/forum?id=jXe91kq3jAq", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 5, + 6 + ], + "rating": "6.00", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "To quickly solve new tasks in complex environments, intelligent agents need to build up reusable knowledge. For example, a learned world model captures knowledge about the environment that applies to new tasks. Similarly, skills capture general behaviors that can apply to new tasks. In this paper, we investigate how these two approaches can be integrated into a single reinforcement learning agent. Specifically, we leverage the idea of partial amortization for fast adaptation at test time. For this, actions are produced by a policy that is learned over time while the skills it conditions on are chosen using online planning. We demonstrate the benefits of our design decisions across a suite of challenging locomotion tasks and demonstrate improved sample efficiency in single tasks as well as in transfer from one task to another, as compared to competitive baselines. 
Videos are available at: https://sites.google.com/view/latent-skill-planning/", + "title": "Latent Skill Planning for Exploration and Transfer", + "authors": [ + "Kevin Xie", + "Homanga Bharadhwaj", + "Danijar Hafner", + "Animesh Garg", + "Florian Shkurti" + ], + "emails": [ + "~Kevin_Xie1", + "~Danijar_Hafner1", + "~Homanga_Bharadhwaj1", + "~Florian_Shkurti1", + "~Animesh_Garg1" + ], + "rank": 809 + }, + { + "url": "https://openreview.net/forum?id=rryJiPXifr", + "decision": "Reject", + "ratings": [ + 7, + 6, + 6, + 5 + ], + "rating": "6.00", + "confidences": [ + 5, + 4, + 1, + 5 + ], + "abstract": "3D Convolutional Neural Networks (3D ConvNets) have been regarded as a powerful class of models for video recognition. Nevertheless, it is not trivial to optimally learn a 3D ConvNets due to high complexity and various options of the training scheme. The most common hand-tuning process starts from learning 3D ConvNets using short video clips and then is followed by learning long-term temporal dependency using lengthy clips, while gradually decaying the learning rate from high to low as training progresses. The fact that such process comes along with several heuristic settings motivates the study to seek an optimal ``path'' to automate the entire training. In this paper, we decompose the path into a series of training ``states'' and specify the hyper-parameters, e.g., learning rate and the length of input clips, in each state. The estimation of the knee point on the performance-epoch curve triggers the transition from one state to another. We perform dynamic programming over all the candidate states to plan the optimal permutation of states, i.e., optimization path. Furthermore, we devise a new 3D ConvNets with a unique design of dual-head classifier to improve the spatial and temporal discrimination. Extensive experiments conducted on seven public video recognition benchmarks demonstrate the advantages of our proposal. 
With the optimization planning, our 3D ConvNets achieves superior results when comparing to the state-of-the-art video recognition approaches. More remarkably, we obtain the top-1 accuracy of 82.5% and 84.3% on the large-scale Kinetics-400 and Kinetics-600 datasets, respectively.", + "title": "Optimization Planning for 3D ConvNets", + "authors": [ + "Zhaofan Qiu", + "Ting Yao", + "Chong-wah Ngo", + "Tao Mei" + ], + "emails": [ + "~Chong-wah_Ngo2", + "~Ting_Yao1", + "~Tao_Mei3", + "~Zhaofan_Qiu2" + ], + "rank": 810 + }, + { + "url": "https://openreview.net/forum?id=czv8Ac3Kg7l", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "Large, multi-dimensional spatio-temporal datasets are omnipresent in modern science and engineering. An effective framework for handling such data are Gaussian process deep generative models (GP-DGMs), which employ GP priors over the latent variables of DGMs. Existing approaches for performing inference in GP-DGMs do not support sparse GP approximations based on inducing points, which are essential for the computational efficiency of GPs, nor do they handle missing data -- a natural occurrence in many spatio-temporal datasets -- in a principled manner. We address these shortcomings with the development of the sparse Gaussian process variational autoencoder (SGP-VAE), characterised by the use of partial inference networks for parameterising sparse GP approximations. Leveraging the benefits of amortised variational inference, the SGP-VAE enables inference in multi-output sparse GPs on previously unobserved data with no additional training. 
The SGP-VAE is evaluated in a variety of experiments where it outperforms alternative approaches including multi-output GPs and structured VAEs.", + "title": "Sparse Gaussian Process Variational Autoencoders", + "authors": [ + "Matthew Ashman", + "Jonathan So", + "William Tebbutt", + "Vincent Fortuin", + "Michael Arthur Leopold Pearce", + "Richard E Turner" + ], + "emails": [ + "~William_Tebbutt1", + "~Jonathan_So1", + "~Michael_Arthur_Leopold_Pearce1", + "~Vincent_Fortuin1", + "~Richard_E_Turner1", + "~Matthew_Ashman1" + ], + "rank": 811 + }, + { + "url": "https://openreview.net/forum?id=RtNpzLdHUAW", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Current machine learning algorithms are designed to work with huge volumes of high dimensional data such as images. However, these algorithms are being increasingly deployed to resource constrained systems such as mobile devices and embedded systems. Even in cases where large computing infrastructure is available, the size of each data instance, as well as datasets, can provide a huge bottleneck in data transfer across communication channels. Also, there is a huge incentive both in energy and monetary terms in reducing both the computational and memory requirements of these algorithms. For non-parametric models that require to leverage the stored training data at the inference time, the increased cost in memory and computation could be even more problematic. In this work, we aim to reduce the volume of data these algorithms must process through an end-to-end two-stage neural subset selection model, where the first stage selects a set of candidate points using a conditionally independent Bernoulli mask followed by an iterative coreset selection via a conditional Categorical distribution. The subset selection model is trained by meta-learning with a distribution of sets. 
We validate our method on set reconstruction and classification tasks with feature selection as well as the selection of representative samples from a given dataset, on which our method outperforms relevant baselines. We also show in our experiments that our method enhances scalability of non-parametric models such as Neural Processes.", + "title": "Stochastic Subset Selection for Efficient Training and Inference of Neural Networks", + "authors": [ + "Andreis Bruno", + "Tuan Nguyen", + "Juho Lee", + "Eunho Yang", + "Sung Ju Hwang" + ], + "emails": [ + "~Juho_Lee2", + "~Andreis_Bruno1", + "~Eunho_Yang1", + "~Tuan_Nguyen3", + "~Sung_Ju_Hwang1" + ], + "rank": 812 + }, + { + "url": "https://openreview.net/forum?id=DILxQP08O3B", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Object goal navigation aims to steer an agent towards a target object based on observations of the agent. It is of pivotal importance to design effective visual representations of the observed scene in determining navigation actions. In this paper, we introduce a Visual Transformer Network (VTNet) for learning informative visual representation in navigation. VTNet is a highly effective structure that embodies two key properties for visual representations: First, the relationships among all the object instances in a scene are exploited; Second, the spatial locations of objects and image regions are emphasized so that directional navigation signals can be learned. Furthermore, we also develop a pre-training scheme to associate the visual representations with navigation signals, and thus facilitate navigation policy learning. In a nutshell, VTNet embeds object and region features with their location cues as spatial-aware descriptors and then incorporates all the encoded descriptors through attention operations to achieve informative representation for navigation. 
Given such visual representations, agents are able to explore the correlations between visual observations and navigation actions. For example, an agent would prioritize ``turning right'' over ``turning left'' when the visual representation emphasizes on the right side of activation map. Experiments in the artificial environment AI2-Thor demonstrate that VTNet significantly outperforms state-of-the-art methods in unseen testing environments.", + "title": "VTNet: Visual Transformer Network for Object Goal Navigation", + "authors": [ + "Heming Du", + "Xin Yu", + "Liang Zheng" + ], + "emails": [ + "~Xin_Yu1", + "~Liang_Zheng4", + "~Heming_Du2" + ], + "rank": 813 + }, + { + "url": "https://openreview.net/forum?id=H5B3lmpO1g", + "decision": "Reject", + "ratings": [ + 5, + 6, + 7 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "6D robotic grasping beyond top-down bin-picking scenarios is a challenging task. Previous solutions based on 6D grasp synthesis with robot motion planning usually operate in an open-loop setting without considering perception feedback and dynamics and contacts of objects, which makes them sensitive to grasp synthesis errors. In this work, we propose a novel method for learning closed-loop control policies for 6D robotic grasping using point clouds from an egocentric camera. We combine imitation learning and reinforcement learning in order to grasp unseen objects and handle the continuous 6D action space, where expert demonstrations are obtained from a joint motion and grasp planner. We introduce a goal-auxiliary actor-critic algorithm, which uses grasping goal prediction as an auxiliary task to facilitate policy learning. The supervision on grasping goals can be obtained from the expert planner for known objects or from hindsight goals for unknown objects. Overall, our learned closed-loop policy achieves over 90% success rates on grasping various ShapeNet objects and YCB objects in simulation. 
The policy also transfers well to the real world with only one failure among grasping of ten different unseen objects in the presence of perception noises.", + "title": "Goal-Auxiliary Actor-Critic for 6D Robotic Grasping with Point Clouds", + "authors": [ + "Lirui Wang", + "Yu Xiang", + "Dieter Fox" + ], + "emails": [ + "~Yu_Xiang3", + "~Lirui_Wang1", + "~Dieter_Fox1" + ], + "rank": 814 + }, + { + "url": "https://openreview.net/forum?id=r28GdiQF7vM", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 5 + ], + "rating": "6.00", + "confidences": [ + 2, + 3, + 3, + 3 + ], + "abstract": "Stochastic optimization has become the workhorse behind many successful machine learning applications, which motivates a lot of theoretical analysis to understand its empirical behavior. As a comparison, there is far less work to study the generalization behavior especially in a non-convex learning setting. In this paper, we study the generalization behavior of stochastic optimization by leveraging the algorithmic stability for learning with \u03b2-gradient-dominated objective functions. We develop generalization bounds of the order O(1/(n\u03b2)) plus the convergence rate of the optimization algorithm, where n is the sample size. Our stability analysis significantly improves the existing non-convex analysis by removing the bounded gradient assumption and implying better generalization bounds. We achieve this improvement by exploiting the smoothness of loss functions instead of the Lipschitz condition in Charles & Papailiopoulos (2018). We apply our general results to various stochastic optimization algorithms, which show clearly how the variance-reduction techniques improve not only training but also generalization. 
Furthermore, our discussion explains how interpolation helps generalization for highly expressive models.", + "title": "Sharper Generalization Bounds for Learning with Gradient-dominated Objective Functions", + "authors": [ + "Yunwen Lei", + "Yiming Ying" + ], + "emails": [ + "~Yunwen_Lei1", + "~Yiming_Ying1" + ], + "rank": 815 + }, + { + "url": "https://openreview.net/forum?id=_O9YLet0wvN", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 7 + ], + "rating": "6.00", + "confidences": [ + 4, + 5, + 3, + 4 + ], + "abstract": "Despite substantial progress in object detection and few-shot learning, detecting objects based on a single example - one-shot object detection - remains a challenge. A central problem is the generalization gap: Object categories used during training are detected much more reliably than novel ones. We here show that this generalization gap can be nearly closed by increasing the number of object categories used during training. Doing so allows us to beat the state-of-the-art on COCO by 5.4 %AP50 (from 22.0 to 27.5) and improve generalization from seen to unseen classes from 45% to 89%. We verify that the effect is caused by the number of categories and not the amount of data and that it holds for different models, backbones and datasets. This result suggests that the key to strong few-shot detection models may not lie in sophisticated metric learning approaches, but instead simply in scaling the number of categories. 
We hope that our findings will help to better understand the challenges of few-shot learning and encourage future data annotation efforts to focus on wider datasets with a broader set of categories rather than gathering more samples per category.", + "title": "Closing the Generalization Gap in One-Shot Object Detection", + "authors": [ + "Claudio Michaelis", + "Matthias Bethge", + "Alexander S Ecker" + ], + "emails": [ + "~Claudio_Michaelis1", + "~Matthias_Bethge1", + "~Alexander_S_Ecker1" + ], + "rank": 816 + }, + { + "url": "https://openreview.net/forum?id=4AWko4A35ss", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 7 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "This paper proposes a novel pretext task for self-supervised video representation learning by exploiting spatiotemporal continuity in videos. It is motivated by the fact that videos are spatiotemporal by nature and a representation learned to detect spatiotemporal continuity/discontinuity is thus beneficial for downstream video content analysis tasks. A natural choice of such a pretext task is to construct spatiotemporal (3D) jigsaw puzzles and learn to solve them. However, this task turns out to be intractable. We thus propose Constrained Spatiotemporal Jigsaw (CSJ) whereby the 3D jigsaws are formed in a constrained manner to ensure that large continuous spatiotemporal cuboids exist in a shuffled clip to provide sufficient cues for the model to reason about the continuity. With the constrained jigsaw puzzles, instead of solving them directly, which could still be extremely hard, we carefully design four surrogate tasks that are more solvable but meanwhile still ensure that the learned representation is sensitive to spatiotemporal continuity at both the local and global levels. 
Extensive experiments show that our CSJ achieves state-of-the-art on two downstream tasks across various benchmarks.", + "title": "Self-Supervised Video Representation Learning with Constrained Spatiotemporal Jigsaw", + "authors": [ + "Yuqi Huo", + "Mingyu Ding", + "Haoyu Lu", + "Zhiwu Lu", + "Tao Xiang", + "Ji-Rong Wen", + "Ziyuan Huang", + "Jianwen Jiang", + "Shiwei Zhang", + "Mingqian Tang", + "Songfang Huang", + "Ping Luo" + ], + "emails": [ + "~Songfang_Huang1", + "~Haoyu_Lu1", + "~Jianwen_Jiang2", + "~Yuqi_Huo1", + "~Zhiwu_Lu1", + "~Ji-Rong_Wen1", + "~Ping_Luo2", + "~Mingyu_Ding1", + "~Ziyuan_Huang1", + "~Mingqian_Tang1", + "~Shiwei_Zhang2", + "~Tao_Xiang1" + ], + "rank": 817 + }, + { + "url": "https://openreview.net/forum?id=2d34y5bRWxB", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "The regularization of prediction models is arguably the most crucial ingredient that allows Machine Learning solutions to generalize well on unseen data. Several types of regularization are popular in the Deep Learning community (e.g., weight decay, drop-out, early stopping, etc.), but so far these are selected on an ad-hoc basis, and there is no systematic study as to how different regularizers should be combined into the best \u201ccocktail\u201d. In this paper, we fill this gap, by considering the cocktails of 13 different regularization methods and framing the question of how to best combine them as a standard hyperparameter optimization problem. 
We perform a large-scale empirical study on 40 tabular datasets, concluding that, firstly, regularization cocktails substantially outperform individual regularization methods, even if the hyperparameters of the latter are carefully tuned; secondly, the optimal regularization cocktail depends on the dataset; and thirdly, regularization cocktails yield the state-of-the-art in classifying tabular datasets by outperforming Gradient-Boosted Decision Trees.", + "title": "Regularization Cocktails for Tabular Datasets", + "authors": [ + "Arlind Kadra", + "Marius Lindauer", + "Frank Hutter", + "Josif Grabocka" + ], + "emails": [ + "~Josif_Grabocka1", + "~Arlind_Kadra1", + "~Frank_Hutter1", + "~Marius_Lindauer1" + ], + "rank": 818 + }, + { + "url": "https://openreview.net/forum?id=GXJPLbB5P-y", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "We focus on prediction problems with high-dimensional outputs that are subject to output validity constraints, e.g. a pseudocode-to-code translation task where the code must compile. For these problems, labeled input-output pairs are expensive to obtain, but \"unlabeled\" outputs, i.e. outputs without corresponding inputs, are freely available and provide information about output validity (e.g. code on GitHub). In this paper, we present predict-and-denoise, a framework that can leverage unlabeled outputs. Specifically, we first train a denoiser to map possibly invalid outputs to valid outputs using synthetic perturbations of the unlabeled outputs. Second, we train a predictor composed with this fixed denoiser. We show theoretically that for a family of functions with a high-dimensional discrete valid output space, composing with a denoiser reduces the complexity of a 2-layer ReLU network needed to represent the function and that this complexity gap can be arbitrarily large. 
We evaluate the framework empirically on several datasets, including image generation from attributes and pseudocode-to-code translation. On the SPoC pseudocode-to-code dataset, our framework improves the proportion of code outputs that pass all test cases by 3-5% over a baseline Transformer.", + "title": "Simplifying Models with Unlabeled Output Data", + "authors": [ + "Sang Michael Xie", + "Tengyu Ma", + "Percy Liang" + ], + "emails": [ + "~Sang_Michael_Xie1", + "~Percy_Liang1", + "~Tengyu_Ma1" + ], + "rank": 819 + }, + { + "url": "https://openreview.net/forum?id=Iz3zU3M316D", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 5, + 7 + ], + "rating": "6.00", + "confidences": [ + 2, + 4, + 4, + 4 + ], + "abstract": "Normalization techniques, such as batch normalization (BN), are a boon for modern deep learning. They let weights converge more quickly with often better generalization performances. It has been argued that the normalization-induced scale invariance among the weights provides an advantageous ground for gradient descent (GD) optimizers: the effective step sizes are automatically reduced over time, stabilizing the overall training procedure. It is often overlooked, however, that the additional introduction of momentum in GD optimizers results in a far more rapid reduction in effective step sizes for scale-invariant weights, a phenomenon that has not yet been studied and may have caused unwanted side effects in the current practice. This is a crucial issue because arguably the vast majority of modern deep neural networks consist of (1) momentum-based GD (e.g. SGD or Adam) and (2) scale-invariant parameters (e.g. more than 90% of the weights in ResNet are scale-invariant due to BN). In this paper, we verify that the widely-adopted combination of the two ingredients lead to the premature decay of effective step sizes and sub-optimal model performances. 
We propose a simple and effective remedy, SGDP and AdamP: get rid of the radial component, or the norm-increasing direction, at each optimizer step. Because of the scale invariance, this modification only alters the effective step sizes without changing the effective update directions, thus enjoying the original convergence properties of GD optimizers. Given the ubiquity of momentum GD and scale invariance in machine learning, we have evaluated our methods against the baselines on 13 benchmarks. They range from vision tasks like classification (e.g. ImageNet), retrieval (e.g. CUB and SOP), and detection (e.g. COCO) to language modelling (e.g. WikiText) and audio classification (e.g. DCASE) tasks. We verify that our solution brings about uniform gains in performances in those benchmarks. Source code is available at https://github.com/clovaai/adamp", + "title": "AdamP: Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights", + "authors": [ + "Byeongho Heo", + "Sanghyuk Chun", + "Seong Joon Oh", + "Dongyoon Han", + "Sangdoo Yun", + "Gyuwan Kim", + "Youngjung Uh", + "Jung-Woo Ha" + ], + "emails": [ + "~Sangdoo_Yun1", + "~Byeongho_Heo1", + "~Sanghyuk_Chun1", + "~Gyuwan_Kim1", + "~Jung-Woo_Ha1", + "~Dongyoon_Han1", + "~Youngjung_Uh2", + "~Seong_Joon_Oh1" + ], + "rank": 820 + }, + { + "url": "https://openreview.net/forum?id=KYPz4YsCPj", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 6, + 7 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Temporal networks serve as abstractions of many real-world dynamic systems. These networks typically evolve according to certain laws, such as the law of triadic closure, which is universal in social networks. Inductive representation learning of temporal networks should be able to capture such laws and further be applied to systems that follow the same laws but have not been unseen during the training stage. 
Previous works in this area depend on either network node identities or rich edge attributes and typically fail to extract these laws. Here, we propose {\\em Causal Anonymous Walks (CAWs)} to inductively represent a temporal network. CAWs are extracted by temporal random walks and work as automatic retrieval of temporal network motifs to represent network dynamics while avoiding the time-consuming selection and counting of those motifs. CAWs adopt a novel anonymization strategy that replaces node identities with the hitting counts of the nodes based on a set of sampled walks to keep the method inductive, and simultaneously establish the correlation between motifs. We further propose a neural-network model CAW-N to encode CAWs, and pair it with a CAW sampling strategy with constant memory and time cost to support online training and inference. CAW-N is evaluated to predict links over 6 real temporal networks and uniformly outperforms previous SOTA methods by averaged 15\\% AUC gain in the inductive setting. CAW-N also outperforms previous methods in 5 out of the 6 networks in the transductive setting.", + "title": "Inductive Representation Learning in Temporal Networks via Causal Anonymous Walks", + "authors": [ + "Yanbang Wang", + "Yen-Yu Chang", + "Yunyu Liu", + "Jure Leskovec", + "Pan Li" + ], + "emails": [ + "~Yunyu_Liu1", + "stanford.edu", + "~Jure_Leskovec1", + "~Yanbang_Wang1", + "~Pan_Li2" + ], + "rank": 821 + }, + { + "url": "https://openreview.net/forum?id=NMgB4CVnMh", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 3, + 4, + 4 + ], + "abstract": "This paper proposes a novel acoustic word embedding called Acoustic Neighbor Embeddings where speech or text of arbitrary length are mapped to a vector space of fixed, reduced dimensions by adapting stochastic neighbor embedding (SNE) to sequential inputs. 
The Euclidean distance between coordinates in the embedding space reflects the phonetic confusability between their corresponding sequences. Two encoder neural networks are trained: an acoustic encoder that accepts speech signals in the form of frame-wise subword posterior probabilities obtained from an acoustic model and a text encoder that accepts text in the form of subword transcriptions. Compared to a triplet loss criterion, the proposed method is shown to have more effective gradients for neural network training. Experimentally, it also gives more accurate results with low-dimensional embeddings when the two encoder networks are used in tandem in a word (name) recognition task, and when the text encoder network is used standalone in an approximate phonetic matching task. In particular, in an isolated name recognition task depending solely on Euclidean nearest-neighbor search between the proposed embedding vectors, the recognition accuracy is identical to that of conventional finite state transducer(FST)-based decoding using test data with up to 1 million names in the vocabulary and 40 dimensions in the embeddings.", + "title": "Acoustic Neighbor Embeddings", + "authors": [ + "Woojay Jeon" + ], + "emails": [ + "~Woojay_Jeon1" + ], + "rank": 822 + }, + { + "url": "https://openreview.net/forum?id=aGmEDl1NWJ-", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 8 + ], + "rating": "6.00", + "confidences": [ + 4, + 2, + 4, + 3 + ], + "abstract": "The growing interest for adversarial examples, i.e. maliciously modified examples which fool a classifier, has resulted in many defenses intended to detect them, render them inoffensive or make the model more robust against them. In this paper, we pave the way towards a new approach to improve the robustness of a model against black-box transfer attacks. 
A removable additional neural network is included in the target model and is designed to induce the \"luring effect\", which tricks the adversary into choosing false directions to fool the target model. Training the additional model is achieved thanks to a loss function acting on the logits sequence order. Our deception-based method only needs to have access to the predictions of the target model and does not require a labeled data set. We explain the luring effect thanks to the notion of robust and non-robust useful features and perform experiments on MNIST, SVHN and CIFAR10 to characterize and evaluate this phenomenon. Additionally, we discuss two simple prediction schemes, and verify experimentally that our approach can be used as a defense to efficiently thwart an adversary using state-of-the-art attacks and allowed to perform large perturbations.", + "title": "Luring of transferable adversarial perturbations in the black-box paradigm", + "authors": [ + "R\u00e9mi Bernhard", + "Pierre-Alain Mo\u00ebllic", + "Jean-Max Dutertre" + ], + "emails": [ + "~Pierre-Alain_Mo\u00ebllic1", + "~R\u00e9mi_Bernhard1", + "~Jean-Max_Dutertre1" + ], + "rank": 823 + }, + { + "url": "https://openreview.net/forum?id=vhKe9UFbrJo", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 5, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Multimodal learning for generative models often refers to the learning of abstract concepts from the commonality of information in multiple modalities, such as vision and language. While it has proven effective for learning generalisable representations, the training of such models often requires a large amount of related multimodal data that shares commonality, which can be expensive to come by. 
To mitigate this, we develop a novel contrastive framework for generative model learning, allowing us to train the model not just by the commonality between modalities, but by the distinction between \"related\" and \"unrelated\" multimodal data. We show in experiments that our method enables data-efficient multimodal learning on challenging datasets for various multimodal VAE models. We also show that under our proposed framework, the generative model can accurately identify related samples from unrelated ones, making it possible to make use of the plentiful unlabeled, unpaired multimodal data.", + "title": "Relating by Contrasting: A Data-efficient Framework for Multimodal Generative Models", + "authors": [ + "Yuge Shi", + "Brooks Paige", + "Philip Torr", + "Siddharth N" + ], + "emails": [ + "~Philip_Torr1", + "~Brooks_Paige1", + "~Yuge_Shi1", + "~Siddharth_N1" + ], + "rank": 824 + }, + { + "url": "https://openreview.net/forum?id=7_G8JySGecm", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 4, + 6, + 7 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Interactive Fiction (IF) games provide a useful testbed for language-based reinforcement learning agents, posing significant challenges of natural language understanding, commonsense reasoning, and non-myopic planning in the combinatorial search space. Agents based on standard planning algorithms struggle to play IF games due to the massive search space of language actions. Thus, language-grounded planning is a key ability of such agents, since inferring the consequence of language action based on semantic understanding can drastically improve search. In this paper, we introduce Monte-Carlo planning with Language Action Value Estimates (MC-LAVE) that combines a Monte-Carlo tree search with language-driven exploration. 
MC-LAVE invests more search effort into semantically promising language actions using locally optimistic language value estimates, yielding a significant reduction in the effective search space of language actions. We then present a reinforcement learning approach via MC-LAVE, which alternates between MC-LAVE planning and supervised learning of the self-generated language actions. In the experiments, we demonstrate that our method achieves new high scores in various IF games.", + "title": "Monte-Carlo Planning and Learning with Language Action Value Estimates", + "authors": [ + "Youngsoo Jang", + "Seokin Seo", + "Jongmin Lee", + "Kee-Eung Kim" + ], + "emails": [ + "~Jongmin_Lee1", + "~Youngsoo_Jang2", + "~Seokin_Seo1", + "~Kee-Eung_Kim4" + ], + "rank": 825 + }, + { + "url": "https://openreview.net/forum?id=HIGSa_3kOx3", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "The objective of \\textit{lifelong} reinforcement learning (RL) is to optimize agents which can continuously adapt and interact in changing environments. However, current RL approaches fail drastically when environments are non-stationary and interactions are non-episodic. We propose \\textit{Lifelong Skill Planning} (LiSP), an algorithmic framework for lifelong RL based on planning in an abstract space of higher-order skills. We learn the skills in an unsupervised manner using intrinsic rewards and plan over the learned skills using a learned dynamics model. Moreover, our framework permits skill discovery even from offline data, thereby reducing the need for excessive real-world interactions. 
We demonstrate empirically that LiSP successfully enables long-horizon planning and learns agents that can avoid catastrophic failures even in challenging non-stationary and non-episodic environments derived from gridworld and MuJoCo benchmarks.", + "title": "Reset-Free Lifelong Learning with Skill-Space Planning", + "authors": [ + "Kevin Lu", + "Aditya Grover", + "Pieter Abbeel", + "Igor Mordatch" + ], + "emails": [ + "~Aditya_Grover1", + "~Kevin_Lu2", + "~Pieter_Abbeel2", + "~Igor_Mordatch4" + ], + "rank": 826 + }, + { + "url": "https://openreview.net/forum?id=lJuOUWlAC8i", + "decision": "Reject", + "ratings": [ + 5, + 6, + 7 + ], + "rating": "6.00", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "Recently, neural-symbolic architectures have achieved success on commonsense reasoning through effectively encoding relational structures retrieved from external knowledge graphs (KGs) and obtained state-of-the-art results in tasks such as (commonsense) question answering and natural language inference. However, current neural-symbolic reasoning methods rely on quality and contextualized knowledge structures (i.e., fact triples) that can be retrieved at the pre-processing stage and overlook challenges such as dealing with incompleteness of a KG (low coverage), limited expressiveness of its relations, and irrelevant retrieved facts in the reasoning context. \nIn this paper, we present a novel neural-symbolic approach, named Hybrid Graph Network (HGN), which jointly generates feature representations for new triples (as complement to the existing edges in the KG), determines relevance of the triples to the reasoning context, and learns graph model parameters for encoding the relational information. Our method learns a compact graph structure (comprising both retrieved and generated edges) through filtering edges that are unhelpful to the reasoning process. 
We show marked improvements on three commonsense reasoning benchmarks and demonstrate the superiority of the learned graph structures with user studies.", + "title": "Learning Contextualized Knowledge Graph Structures for Commonsense Reasoning", + "authors": [ + "Jun Yan", + "Mrigank Raman", + "Tianyu Zhang", + "Ryan Rossi", + "Handong Zhao", + "Sungchul Kim", + "Nedim Lipka", + "Xiang Ren" + ], + "emails": [ + "~Jun_Yan5", + "mails.tsinghua.edu.cn", + "iitd.ac.in", + "~Handong_Zhao3", + "adobe.com", + "~Xiang_Ren1", + "~Ryan_Rossi1" + ], + "rank": 827 + }, + { + "url": "https://openreview.net/forum?id=l-LGlk4Yl6G", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 3, + 3, + 4 + ], + "abstract": "Motivated by metagenomics, recommender systems, dictionary learning, and related problems, this paper introduces subspace splitting(SS): the task of clustering the entries of what we call amixed-features vector, that is, a vector whose subsets of coordinates agree with a collection of subspaces. We derive precise identifiability conditions under which SS is well-posed, thus providing the first fundamental theory for this problem. We also propose the first three practical SS algorithms, each with advantages and disadvantages: a random sampling method , a projection-based greedy heuristic , and an alternating Lloyd-type algorithm ; all allow noise, outliers, and missing data. Our extensive experiments outline the performance of our algorithms, and in lack of other SS algorithms, for reference we compare against methods for tightly related problems, like robust matched subspace detection and maximum feasible subsystem, which are special simpler cases of SS.", + "title": "Mixed-Features Vectors and Subspace Splitting", + "authors": [ + "Alejandro Pimentel-Alarc\u00f3n", + "Daniel L. 
Pimentel-Alarc\u00f3n" + ], + "emails": [ + "~Alejandro_Pimentel-Alarc\u00f3n2", + "~Daniel_L._Pimentel-Alarc\u00f3n1" + ], + "rank": 828 + }, + { + "url": "https://openreview.net/forum?id=o81ZyBCojoA", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4, + 2 + ], + "abstract": "Model-agnostic meta-learning (MAML) has emerged as one of the most successful meta-learning techniques in few-shot learning. It enables us to learn a meta-initialization of model parameters (that we call meta-model) to rapidly adapt to new tasks using a small amount of labeled training data. Despite the generalization power of the meta-model, it remains elusive that how adversarial robustness can be maintained by MAML in few-shot learning. In addition to generalization, robustness is also desired for a meta-model to defend adversarial examples (attacks). Toward promoting adversarial robustness in MAML, we first study when a robustness-promoting regularization should be incorporated, given the fact that MAML adopts a bi-level (fine-tuning vs. meta-update) learning procedure. We show that robustifying the meta-update stage is sufficient to make robustness adapted to the task-specific fine-tuning stage even if the latter uses a standard training protocol. We also make additional justification on the acquired robustness adaptation by peering into the interpretability of neurons' activation maps. Furthermore, we investigate how robust regularization can efficiently be designed in MAML. We propose a general but easily-optimized robustness-regularized meta-learning framework, which allows the use of unlabeled data augmentation, fast adversarial attack generation, and computationally-light fine-tuning. In particular, we for the first time show that the auxiliary contrastive learning task can enhance the adversarial robustness of MAML. 
Finally, extensive experiments are conducted to demonstrate the effectiveness of our proposed methods in robust few-shot learning.", + "title": "On Fast Adversarial Robustness Adaptation in Model-Agnostic Meta-Learning", + "authors": [ + "Ren Wang", + "Kaidi Xu", + "Sijia Liu", + "Pin-Yu Chen", + "Tsui-Wei Weng", + "Chuang Gan", + "Meng Wang" + ], + "emails": [ + "~Ren_Wang1", + "~Meng_Wang4", + "~Kaidi_Xu1", + "~Pin-Yu_Chen1", + "~Chuang_Gan1", + "~Tsui-Wei_Weng1", + "~Sijia_Liu1" + ], + "rank": 829 + }, + { + "url": "https://openreview.net/forum?id=3RLN4EPMdYd", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 7, + 6 + ], + "rating": "6.00", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "Learning to predict the long-term future of video frames is notoriously challenging due to the inherent ambiguities in a distant future and dramatic amplification of prediction error over time. Despite the recent advances in the literature, existing approaches are limited to moderately short-term prediction (less than a few seconds), while extrapolating it to a longer future quickly leads to destruction in structure and content. In this work, we revisit the hierarchical models in video prediction. Our method generates future frames by first estimating a sequence of dense semantic structures and subsequently translating the estimated structures to pixels by video-to-video translation model. Despite the simplicity, we show that modeling structures and their dynamics in categorical structure space with stochastic sequential estimator leads to surprisingly successful long-term prediction. We evaluate our method on two challenging video prediction scenarios, \\emph{car driving} and \\emph{human dancing}, and demonstrate that it can generate complicated scene structures and motions over a very long time horizon (\\ie~thousands frames), setting a new standard of video prediction with orders of magnitude longer prediction time than existing approaches. 
Video results are available at https://1konny.github.io/HVP/.", + "title": "Revisiting Hierarchical Approach for Persistent Long-Term Video Prediction", + "authors": [ + "Wonkwang Lee", + "Whie Jung", + "Han Zhang", + "Ting Chen", + "Jing Yu Koh", + "Thomas Huang", + "Hyungsuk Yoon", + "Honglak Lee", + "Seunghoon Hong" + ], + "emails": [ + "~Whie_Jung1", + "~Seunghoon_Hong2", + "~Jing_Yu_Koh2", + "~Ting_Chen1", + "~Wonkwang_Lee2", + "~Honglak_Lee2", + "~Hyungsuk_Yoon2", + "umich.edu", + "~Han_Zhang1" + ], + "rank": 830 + }, + { + "url": "https://openreview.net/forum?id=gGDlZrfFq9d", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Standard supervised learning breaks down under data distribution shift. However, the principle of independent causal mechanisms (ICM, Peters et al. (2017)) can turn this weakness into an opportunity: one can take advantage of distribution shift between different environments during training in order to obtain more robust models. We propose a new gradient-based learning framework whose objective function is derived from the ICM principle. We show theoretically and experimentally that neural networks trained in this framework focus on relations remaining invariant across environments and ignore unstable ones. Moreover, we prove that the recovered stable relations correspond to the true causal mechanisms under certain conditions. 
In both regression and classification, the resulting models generalize well to unseen scenarios where traditionally trained models fail.", + "title": "Learning Robust Models using the Principle of Independent Causal Mechanisms", + "authors": [ + "Jens M\u00fcller", + "Robert Schmier", + "Lynton Ardizzone", + "Carsten Rother", + "Ullrich Koethe" + ], + "emails": [ + "~Ullrich_Koethe1", + "~Carsten_Rother1", + "~Robert_Schmier1", + "~Jens_M\u00fcller2", + "~Lynton_Ardizzone1" + ], + "rank": 831 + }, + { + "url": "https://openreview.net/forum?id=0NQdxInFWT_", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "Subsampling a signal of interest can reduce costly data transfer, battery drain, radiation exposure and acquisition time in a wide range of problems. The recently proposed Deep Probabilistic Subsampling (DPS) method effectively integrates subsampling in an end-to-end deep learning model, but learns a static pattern for all datapoints. We generalize DPS to a sequential method that actively picks the next sample based on the information acquired so far; dubbed Active-DPS (A-DPS). We validate that A-DPS improves over DPS for MNIST classification at high subsampling rates. We observe that A-DPS learns to actively adapt based on the previously sampled elements, yielding different sampling sequences across the dataset. Moreover, we demonstrate strong performance in active acquisition Magnetic Resonance Image (MRI) reconstruction, outperforming DPS and other deep learning methods.", + "title": "Active Deep Probabilistic Subsampling", + "authors": [ + "Hans van Gorp", + "Iris A.M. Huijben", + "Bastiaan S. 
Veeling", + "Nicola Pezzotti", + "Ruud Van Sloun" + ], + "emails": [ + "~Iris_A.M._Huijben1", + "~Ruud_Van_Sloun1", + "~Bastiaan_S._Veeling1", + "~Nicola_Pezzotti2", + "~Hans_van_Gorp1" + ], + "rank": 832 + }, + { + "url": "https://openreview.net/forum?id=o2ko2D_uvXJ", + "decision": "Reject", + "ratings": [ + 7, + 5, + 6 + ], + "rating": "6.00", + "confidences": [ + 5, + 5, + 4 + ], + "abstract": "Despite the success of deep learning in domains such as image, voice, and graphs, there has been little progress in deep representation learning for domains without a known structure between features. For instance, a tabular dataset of different demographic and clinical factors where the feature interactions are not given as a prior. In this paper, we propose Group-Connected Multilayer Perceptron (GMLP) networks to enable deep representation learning in these domains. GMLP is based on the idea of learning expressive feature combinations (groups) and exploiting them to reduce the network complexity by defining local group-wise operations. During the training phase, GMLP learns a sparse feature grouping matrix using temperature annealing softmax with an added entropy loss term to encourage the sparsity. Furthermore, an architecture is suggested which resembles binary trees, where group-wise operations are followed by pooling operations to combine information; reducing the number of groups as the network grows in depth. To evaluate the proposed method, we conducted experiments on different real-world datasets covering various application areas. Additionally, we provide visualizations on MNIST and synthesized data. 
According to the results, GMLP is able to successfully learn and exploit expressive feature combinations and achieve state-of-the-art classification performance on different datasets.", + "title": "Group-Connected Multilayer Perceptron Networks", + "authors": [ + "Mohammad Kachuee", + "Sajad Darabi", + "Shayan Fazeli", + "Majid Sarrafzadeh" + ], + "emails": [ + "~Majid_Sarrafzadeh1", + "~Mohammad_Kachuee1", + "cs.ucla.edu", + "~Sajad_Darabi1" + ], + "rank": 833 + }, + { + "url": "https://openreview.net/forum?id=CBmJwzneppz", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 7, + 6 + ], + "rating": "6.00", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "We design a new provably efficient algorithm for episodic reinforcement learning with generalized linear function approximation. We analyze the algorithm under a new expressivity assumption that we call ``optimistic closure,'' which is strictly weaker than assumptions from prior analyses for the linear setting. With optimistic closure, we prove that our algorithm enjoys a regret bound of O~(Hd3T) where H is the horizon, d is the dimensionality of the state-action features and T is the number of episodes. 
This is the first statistically and computationally efficient algorithm for reinforcement learning with generalized linear functions.", + "title": "Optimism in Reinforcement Learning with Generalized Linear Function Approximation", + "authors": [ + "Yining Wang", + "Ruosong Wang", + "Simon Shaolei Du", + "Akshay Krishnamurthy" + ], + "emails": [ + "~Yining_Wang1", + "~Simon_Shaolei_Du1", + "~Akshay_Krishnamurthy1", + "~Ruosong_Wang1" + ], + "rank": 834 + }, + { + "url": "https://openreview.net/forum?id=L5b6jUonKFB", + "decision": "Reject", + "ratings": [ + 6, + 7, + 5 + ], + "rating": "6.00", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "CNNs and computational models of biological vision share some fundamental principles, which, combined with recent developments in deep learning, have opened up new avenues of research in neuroscience. However, in contrast to biological models, conventional CNN architectures are based on spatio-temporally discrete representations, and thus cannot accommodate certain aspects of biological complexity such as continuously varying receptive field sizes and temporal dynamics of neuronal responses. Here we propose deep continuous networks (DCNs), which combine spatially continuous convolutional filter representations, with the continuous time framework of neural ODEs. This allows us to learn the spatial support of the filters during training, as well as model the temporal evolution of feature maps, linking DCNs closely to biological models. We show that DCNs are versatile. Experimentally, we demonstrate their applicability to a standard classification problem, where they allow for parameter reductions and meta-parametrization. We illustrate the biological plausibility of the scale distributions learned by DCNs and explore their performance in a pattern completion task, which is inspired by models from computational neuroscience. 
Finally, we suggest that the continuous representations learned by DCNs may enable computationally efficient implementations.", + "title": "Deep Continuous Networks", + "authors": [ + "Nergis Tomen", + "Silvia Laura Pintea", + "Jan van Gemert" + ], + "emails": [ + "~Jan_van_Gemert1", + "~Nergis_Tomen1", + "~Silvia_Laura_Pintea1" + ], + "rank": 835 + }, + { + "url": "https://openreview.net/forum?id=eJIJF3-LoZO", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 5, + 6, + 7 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Developing algorithms that are able to generalize to a novel task given only a few labeled examples represents a fundamental challenge in closing the gap between machine- and human-level performance. The core of human cognition lies in the structured, reusable concepts that help us to rapidly adapt to new tasks and provide reasoning behind our decisions. However, existing meta-learning methods learn complex representations across prior labeled tasks without imposing any structure on the learned representations. Here we propose COMET, a meta-learning method that improves generalization ability by learning to learn along human-interpretable concept dimensions. Instead of learning a joint unstructured metric space, COMET learns mappings of high-level concepts into semi-structured metric spaces, and effectively combines the outputs of independent concept learners. We evaluate our model on few-shot tasks from diverse domains, including fine-grained image classification, document categorization and cell type annotation on a novel dataset from a biological domain developed in our work. 
COMET significantly outperforms strong meta-learning baselines, achieving 6-15% relative improvement on the most challenging 1-shot learning tasks, while unlike existing methods providing interpretations behind the model's predictions.", + "title": "Concept Learners for Few-Shot Learning", + "authors": [ + "Kaidi Cao", + "Maria Brbic", + "Jure Leskovec" + ], + "emails": [ + "~Maria_Brbic1", + "~Jure_Leskovec1", + "~Kaidi_Cao1" + ], + "rank": 836 + }, + { + "url": "https://openreview.net/forum?id=Q1jmmQz72M2", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 5, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": " Neural Ordinary Differential Equations (NODEs), a framework of continuous-depth neural networks, have been widely applied, showing exceptional efficacy in coping with some representative datasets. Recently, an augmented framework has been successfully developed for conquering some limitations emergent in application of the original framework. Here we propose a new class of continuous-depth neural networks with delay, named as Neural Delay Differential Equations (NDDEs), and, for computing the corresponding gradients, we use the adjoint sensitivity method to obtain the delayed dynamics of the adjoint. Since the differential equations with delays are usually seen as dynamical systems of infinite dimension possessing more fruitful dynamics, the NDDEs, compared to the NODEs, own a stronger capacity of nonlinear representations. Indeed, we analytically validate that the NDDEs are of universal approximators, and further articulate an extension of the NDDEs, where the initial function of the NDDEs is supposed to satisfy ODEs. More importantly, we use several illustrative examples to demonstrate the outstanding capacities of the NDDEs and the NDDEs with ODEs' initial value. 
More precisely, (1) we successfully model the delayed dynamics where the trajectories in the lower-dimensional phase space could be mutually intersected, while the traditional NODEs without any argumentation are not directly applicable for such modeling, and (2) we achieve lower loss and higher accuracy not only for the data produced synthetically by complex models but also for the real-world image datasets, i.e., CIFAR10, MNIST and SVHN. Our results on the NDDEs reveal that appropriately articulating the elements of dynamical systems into the network design is truly beneficial to promoting the network performance.", + "title": "Neural Delay Differential Equations", + "authors": [ + "Qunxi Zhu", + "Yao Guo", + "Wei Lin" + ], + "emails": [ + "~Yao_Guo3", + "~Qunxi_Zhu1", + "~Wei_Lin1" + ], + "rank": 837 + }, + { + "url": "https://openreview.net/forum?id=4rsTcjH7co", + "decision": "Reject", + "ratings": [ + 5, + 6, + 7, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "One of the fascinating properties of deep learning is the ability of the network to reveal the underlying factors characterizing elements in datasets of different types. Autoencoders represent an effective approach for computing these factors. Autoencoders have been studied in the context of enabling interpolation between data points by decoding convex combinations of latent vectors. However, this interpolation often leads to artifacts or produces unrealistic results during reconstruction. We argue that these incongruities are due to the structure of the latent space and to the fact that such naively interpolated latent vectors deviate from the data manifold. In this paper, we propose a regularization technique that shapes the latent representation to follow a manifold that is consistent with the training images and that forces the manifold to be smooth and locally convex. 
This regularization not only enables faithful interpolation between data points, as we show herein, but can also be used as a general regularization technique to avoid overfitting or to produce new samples for data augmentation", + "title": "Autoencoder Image Interpolation by Shaping the Latent Space", + "authors": [ + "Alon Oring", + "Zohar Yakhini", + "Yacov Hel-Or" + ], + "emails": [ + "~Alon_Oring1", + "~Yacov_Hel-Or1", + "gmail.com" + ], + "rank": 838 + }, + { + "url": "https://openreview.net/forum?id=7apQQsbahFz", + "decision": "Reject", + "ratings": [ + 5, + 6, + 7, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4, + 2 + ], + "abstract": "A hallmark of an AI agent is to mimic human beings to understand and interact with others. In this paper, we propose a \\emph{collaborative} multi-agent reinforcement learning algorithm to learn a \\emph{joint} policy through the interactions over agents. To make a joint decision over the group, each agent makes an initial decision and tells its policy to its neighbors. Then each agent modifies its own policy properly based on received messages and spreads out its plan. As this intention propagation procedure goes on, we prove that it converges to a mean-field approximation of the joint policy with the framework of neural embedded probabilistic inference. 
We evaluate our algorithm on several large scale challenging tasks and demonstrate that it outperforms previous state-of-the-arts.", + "title": "Intention Propagation for Multi-agent Reinforcement Learning", + "authors": [ + "Chao Qu", + "Hui Li", + "Chang Liu", + "Junwu Xiong", + "james zhang", + "Wei Chu", + "Weiqiang Wang", + "Yuan Qi", + "Le Song" + ], + "emails": [ + "~Chao_Qu2", + "~Weiqiang_Wang4", + "~Le_Song1", + "antgroup.com", + "~Chang_Liu7", + "gmail.com", + "~Wei_Chu1", + "~Hui_Li2" + ], + "rank": 839 + }, + { + "url": "https://openreview.net/forum?id=guEuB3FPcd", + "decision": "Reject", + "ratings": [ + 5, + 7, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 2 + ], + "abstract": "Neural networks have historically been built layerwise from the set of functions in f:Rn\u2192Rm, i.e. with activations and weights/parameters represented by real numbers, R. Our work considers a richer set of objects for activations and weights, and undertakes a comprehensive study of alternative algebras as number representations by studying their performance on two challenging problems: large-scale image classification using the ImageNet dataset and language modeling using the enwiki8 and WikiText-103 datasets. We denote this broader class of models as AlgebraNets. Our findings indicate that the conclusions of prior work, which explored neural networks constructed from C (complex numbers) and H (quaternions) on smaller datasets, do not always transfer to these challenging settings. However, our results demonstrate that there are alternative algebras which deliver better parameter and computational efficiency compared with R. We consider C, H, M2(R) (the set of 2\u00d72 real-valued matrices), M2(C), M3(R), M4(R), dual numbers and the R3 cross product. 
Additionally, we note that multiplication in these algebras has higher compute density than real multiplication, a useful property in situations with inherently limited parameter reuse such as auto-regressive inference and sparse neural networks. We therefore investigate how to induce sparsity within AlgebraNets. We hope that our strong results on large-scale, practical benchmarks will spur further exploration of these unconventional architectures which challenge the default choice of using real numbers for neural network weights and activations.", + "title": "AlgebraNets", + "authors": [ + "Jordan Hoffmann", + "Simon Schmitt", + "Simon Osindero", + "Karen Simonyan", + "Erich Elsen" + ], + "emails": [ + "~Simon_Osindero1", + "~Jordan_Hoffmann1", + "~Erich_Elsen1", + "~Simon_Schmitt1", + "~Karen_Simonyan1" + ], + "rank": 840 + }, + { + "url": "https://openreview.net/forum?id=AJY3fGPF1DC", + "decision": "Reject", + "ratings": [ + 8, + 6, + 6, + 4 + ], + "rating": "6.00", + "confidences": [ + 4, + 5, + 3, + 4 + ], + "abstract": "Selecting causal inference models for estimating individualized treatment effects (ITE) from observational data presents a unique challenge since the counterfactual outcomes are never observed. The problem is challenged further in the unsupervised domain adaptation (UDA) setting where we only have access to labeled samples in the source domain, but desire selecting a model that achieves good performance on a target domain for which only unlabeled samples are available. Existing techniques for UDA model selection are designed for the predictive setting. These methods examine discriminative density ratios between the input covariates in the source and target domain and do not factor in the model's predictions in the target domain. Because of this, two models with identical performance on the source domain would receive the same risk score by existing methods, but in reality, have significantly different performance in the test domain. 
We leverage the invariance of causal structures across domains to propose a novel model selection metric specifically designed for ITE methods under the UDA setting. In particular, we propose selecting models whose predictions of interventions' effects satisfy known causal structures in the target domain. Experimentally, our method selects ITE models that are more robust to covariate shifts on several healthcare datasets, including estimating the effect of ventilation in COVID-19 patients from different geographic locations.", + "title": "Selecting Treatment Effects Models for Domain Adaptation Using Causal Knowledge", + "authors": [ + "Trent Kyono", + "Ioana Bica", + "Zhaozhi Qian", + "Mihaela van der Schaar" + ], + "emails": [ + "~Trent_Kyono1", + "~Ioana_Bica1", + "~Zhaozhi_Qian1", + "~Mihaela_van_der_Schaar2" + ], + "rank": 841 + }, + { + "url": "https://openreview.net/forum?id=Vd7lCMvtLqg", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 7 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Learning continuous representations of discrete objects such as text, users, movies, and URLs lies at the heart of many applications including language and user modeling. When using discrete objects as input to neural networks, we often ignore the underlying structures (e.g., natural groupings and similarities) and embed the objects independently into individual vectors. As a result, existing methods do not scale to large vocabulary sizes. In this paper, we design a simple and efficient embedding algorithm that learns a small set of anchor embeddings and a sparse transformation matrix. We call our method Anchor & Transform (ANT) as the embeddings of discrete objects are a sparse linear combination of the anchors, weighted according to the transformation matrix. ANT is scalable, flexible, and end-to-end trainable. 
We further provide a statistical interpretation of our algorithm as a Bayesian nonparametric prior for embeddings that encourages sparsity and leverages natural groupings among objects. By deriving an approximate inference algorithm based on Small Variance Asymptotics, we obtain a natural extension that automatically learns the optimal number of anchors instead of having to tune it as a hyperparameter. On text classification, language modeling, and movie recommendation benchmarks, we show that ANT is particularly suitable for large vocabulary sizes and demonstrates stronger performance with fewer parameters (up to 40x compression) as compared to existing compression baselines.", + "title": "Anchor & Transform: Learning Sparse Embeddings for Large Vocabularies", + "authors": [ + "Paul Pu Liang", + "Manzil Zaheer", + "Yuan Wang", + "Amr Ahmed" + ], + "emails": [ + "~Paul_Pu_Liang1", + "~Manzil_Zaheer1", + "~Amr_Ahmed1", + "~Yuan_Wang1" + ], + "rank": 842 + }, + { + "url": "https://openreview.net/forum?id=lDjgALS4qs8", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 3, + 3, + 4 + ], + "abstract": "In this paper, we propose a unified explanation of representation for layer-aware neural sequence encoders, which regards the representation as a revisited multigraph called multi-order-graph (MoG), so that model encoding can be viewed as a processing to capture all subgraphs in MoG. The relationship reflected by Multi-order-graph, called n-order dependency, can present what existing simple directed graph explanation cannot present. Our proposed MoG explanation allows to precisely observe every step of the generation of representation, put diverse relationship such as syntax into a unifiedly depicted framework. Based on the proposed MoG explanation, we further propose a graph-based self-attention network empowered Graph-Transformer by enhancing the ability of capturing subgraph information over the current models. 
Graph-Transformer accommodates different subgraphs into different groups, which allows model to focus on salient subgraphs. Result of experiments on neural machine translation tasks show that the MoG-inspired model can yield effective performance improvement.", + "title": "To Understand Representation of Layer-aware Sequence Encoders as Multi-order-graph", + "authors": [ + "Sufeng Duan", + "hai zhao", + "Rui Wang" + ], + "emails": [ + "~Sufeng_Duan1", + "~Rui_Wang10", + "~hai_zhao1" + ], + "rank": 843 + }, + { + "url": "https://openreview.net/forum?id=JHx9ZDCQEA", + "decision": "Reject", + "ratings": [ + 6, + 6, + 7, + 5 + ], + "rating": "6.00", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "Polymers appear everywhere in our daily lives -- fabrics, plastics, rubbers, etc. -- and we could hardly live without them. To make polymers, chemists develop processes that combine smaller building blocks~(monomers) to form long chains or complex networks~(polymers). These processes are called polymerizations and will usually take lots of human efforts to develop. Although machine learning models for small molecules have generated lots of promising results, the prediction problem for polymerization is new and suffers from the scarcity of polymerization datasets available in the field. Furthermore, the problem is made even more challenging by the large size of the polymers and the additional recursive constraints, which are not present in the small molecule problem. In this paper, we make an initial step towards this challenge and propose a learning-based search framework that can automatically identify a sequence of reactions that lead to the polymerization of a target polymer with minimal polymerization data involved. Our method transfers models trained on small molecule datasets for retrosynthesis to check the validity of polymerization reaction. 
Furthermore, our method also incorporates a template prior learned on a limited amount of polymer data into the framework to adapt the model from small molecule to the polymer domain. We demonstrate that our method is able to propose high-quality polymerization plans for a dataset of 52 real-world polymers, of which a significant portion successfully recovers the currently-in-used polymerization processes in the real world.", + "title": "PolyRetro: Few-shot Polymer Retrosynthesis via Domain Adaptation", + "authors": [ + "Binghong Chen", + "Chengtao Li", + "Hanjun Dai", + "Rampi Ramprasad", + "Le Song" + ], + "emails": [ + "~Binghong_Chen1", + "~Le_Song1", + "gmail.com", + "~Hanjun_Dai1", + "~Chengtao_Li1" + ], + "rank": 844 + }, + { + "url": "https://openreview.net/forum?id=IG3jEGLN0jd", + "decision": "Reject", + "ratings": [ + 6, + 7, + 6, + 5 + ], + "rating": "6.00", + "confidences": [ + 3, + 4, + 2, + 4 + ], + "abstract": "Contrastive learning is an approach to representation learning that utilizes naturally occurring similar and dissimilar pairs of data points to find useful embeddings of data. In the context of document classification under topic modeling assumptions, we prove that contrastive learning is capable of recovering a representation of documents that reveals their underlying topic posterior information to linear models. 
We apply this procedure in a semi-supervised setup and demonstrate empirically that linear classifiers with these representations perform well in document classification tasks with very few training examples.", + "title": "Contrastive estimation reveals topic posterior information to linear models", + "authors": [ + "Christopher Tosh", + "Akshay Krishnamurthy", + "Daniel Hsu" + ], + "emails": [ + "~Daniel_Hsu1", + "~Christopher_Tosh1", + "~Akshay_Krishnamurthy1" + ], + "rank": 845 + }, + { + "url": "https://openreview.net/forum?id=jjKzfD9vP9", + "decision": "Reject", + "ratings": [ + 7, + 6, + 5 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "The Mixup scheme of mixing a pair of samples to create an augmented training sample has gained much attention recently for better training of neural networks. A straightforward and widely used extension is to combine Mixup and regional dropout methods: removing random patches from a sample and replacing it with the features from another sample. Albeit their simplicity and effectiveness, these methods are prone to create harmful samples due to their randomness. In recent studies, attempts to prevent such a phenomenon by selecting only the most informative features are gradually emerging. However, this maximum saliency strategy acts against their fundamental duty of sample diversification as they always deterministically select regions with maximum saliency, injecting bias into the augmented data. To address this problem, we present Saliency Grafting, a novel Mixup-like data augmentation method that captures the best of both ways. By stochastically sampling the features and \u2018grafting\u2019 them onto another sample, our method effectively generates diverse yet meaningful samples. 
The second ingredient of Saliency Grafting is to produce the label of the grafted sample by mixing the labels in a saliency-calibrated fashion, which rectifies supervision misguidance introduced by the random sampling procedure. Our experiments under CIFAR and ImageNet datasets show that our scheme outperforms the current state-of-the-art augmentation strategies not only in terms of classification accuracy, but is also superior in coping under stress conditions such as data corruption and data scarcity. The code will be released.", + "title": "Saliency Grafting: Innocuous Attribution-Guided Mixup with Calibrated Label Mixing", + "authors": [ + "Joonhyung Park", + "June Yong Yang", + "Jinwoo Shin", + "Sung Ju Hwang", + "Eunho Yang" + ], + "emails": [ + "~Joonhyung_Park1", + "~Eunho_Yang1", + "~Jinwoo_Shin1", + "~June_Yong_Yang1", + "~Sung_Ju_Hwang1" + ], + "rank": 846 + }, + { + "url": "https://openreview.net/forum?id=tHgJoMfy6nI", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 2, + 4 + ], + "abstract": "The goal of continual learning (CL) is to learn a sequence of tasks without suffering from the phenomenon of catastrophic forgetting. Previous work has shown that leveraging memory in the form of a replay buffer can reduce performance degradation on prior tasks. We hypothesize that forgetting can be further reduced when the model is encouraged to remember the \\textit{evidence} for previously made decisions. As a first step towards exploring this hypothesis, we propose a simple novel training paradigm, called Remembering for the Right Reasons (RRR), that additionally stores visual model explanations for each example in the buffer and ensures the model has ``the right reasons'' for its predictions by encouraging its explanations to remain consistent with those used to make decisions at training time. 
Without this constraint, there is a drift in explanations and increase in forgetting as conventional continual learning algorithms learn new tasks. We demonstrate how RRR can be easily added to any memory or regularization-based approach and results in reduced forgetting, and more importantly, improved model explanations. We have evaluated our approach in the standard and few-shot settings and observed a consistent improvement across various CL approaches using different architectures and techniques to generate model explanations and demonstrated our approach showing a promising connection between explainability and continual learning. Our code is available at \\url{https://github.com/SaynaEbrahimi/Remembering-for-the-Right-Reasons}.", + "title": "Remembering for the Right Reasons: Explanations Reduce Catastrophic Forgetting", + "authors": [ + "Sayna Ebrahimi", + "Suzanne Petryk", + "Akash Gokul", + "William Gan", + "Joseph E. Gonzalez", + "Marcus Rohrbach", + "trevor darrell" + ], + "emails": [ + "~trevor_darrell1", + "~Suzanne_Petryk1", + "~Joseph_E._Gonzalez1", + "~Sayna_Ebrahimi1", + "berkeley.edu", + "~Marcus_Rohrbach1" + ], + "rank": 847 + }, + { + "url": "https://openreview.net/forum?id=jMPcEkJpdD", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 5, + 5 + ], + "abstract": "Self-supervised learning of video representations has received great attention. Existing methods typically require frames to be decoded before being processed, which increases compute and storage requirements and ultimately hinders large-scale training. In this work, we propose an efficient self-supervised approach to learn video representations by eliminating the expensive decoding step. We use a three-stream video architecture that encodes I-frames and P-frames of a compressed video. 
Unlike existing approaches that encode I-frames and P-frames individually, we propose to jointly encode them by establishing bidirectional dynamic connections across streams. To enable self-supervised learning, we propose two pretext tasks that leverage the multimodal nature (RGB, motion vector, residuals) and the internal GOP structure of compressed videos. The first task asks our network to predict zeroth-order motion statistics in a spatio-temporal pyramid; the second task asks correspondence types between I-frames and P-frames after applying temporal transformations. We show that our approach achieves competitive performance on compressed video recognition both in supervised and self-supervised regimes. \n", + "title": "Self-Supervised Learning of Compressed Video Representations", + "authors": [ + "Youngjae Yu", + "Sangho Lee", + "Gunhee Kim", + "Yale Song" + ], + "emails": [ + "~Sangho_Lee1", + "~Youngjae_Yu1", + "~Yale_Song1", + "~Gunhee_Kim1" + ], + "rank": 848 + }, + { + "url": "https://openreview.net/forum?id=E8fmaZwzEj", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "Robustness of convolutional neural networks (CNNs) has gained in importance on account of adversarial examples, i.e., inputs added as well-designed perturbations that are imperceptible to humans but can cause the model to predict incorrectly. Recent research suggests that the noise in adversarial examples breaks the textural structure, which eventually leads to wrong predictions. To mitigate the threat of such adversarial attacks, we propose defective convolutional networks that make predictions relying less on textural information but more on shape information by properly integrating defective convolutional layers into standard CNNs. The defective convolutional layers contain defective neurons whose activations are set to be a constant function. 
As defective neurons contain no information and are far different from standard neurons in its spatial neighborhood, the textural features cannot be accurately extracted, and so the model has to seek other features for classification, such as the shape. We show extensive evidence to justify our proposal and demonstrate that defective CNNs can defend against black-box attacks better than standard CNNs. In particular, they achieve state-of-the-art performance against transfer-based attacks without any adversarial training being applied.\n\n", + "title": "Defective Convolutional Networks", + "authors": [ + "Tiange Luo", + "Tianle Cai", + "Mengxiao Zhang", + "Siyu Chen", + "Di He", + "Liwei Wang" + ], + "emails": [ + "~Tiange_Luo1", + "~Liwei_Wang1", + "~Mengxiao_Zhang2", + "~Di_He1", + "~Tianle_Cai1", + "~Siyu_Chen1" + ], + "rank": 849 + }, + { + "url": "https://openreview.net/forum?id=OCm0rwa1lx1", + "decision": "Reject", + "ratings": [ + 7, + 6, + 6, + 5 + ], + "rating": "6.00", + "confidences": [ + 5, + 5, + 3, + 5 + ], + "abstract": "Transformers have been successfully applied to sequential tasks despite being feedforward networks. Unlike recurrent neural networks, Transformers use attention to capture temporal relations while processing input tokens in parallel. While this parallelization makes them computationally efficient, it restricts the model from fully exploiting the sequential nature of the input. The representation at a given layer can only access representations from lower layers, rather than the higher level representations already available. In this work, we propose the Feedback Transformer architecture that exposes all previous representations to all future representations, meaning the lowest representation of the current timestep is formed from the highest-level abstract representation of the past. 
We demonstrate on a variety of benchmarks in language modeling, machine translation, and reinforcement learning that the increased representation capacity can create small, shallow models with much stronger performance than comparable Transformers.", + "title": "Addressing Some Limitations of Transformers with Feedback Memory", + "authors": [ + "Angela Fan", + "Thibaut Lavril", + "Edouard Grave", + "Armand Joulin", + "Sainbayar Sukhbaatar" + ], + "emails": [ + "~Angela_Fan2", + "~Armand_Joulin1", + "~Edouard_Grave1", + "~Sainbayar_Sukhbaatar1", + "fb.com" + ], + "rank": 850 + }, + { + "url": "https://openreview.net/forum?id=yFJ67zTeI2", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 7, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Knowledge about the locations of keypoints of an object in an image can assist in fine-grained classification and identification tasks, particularly for the case of objects that exhibit large variations in poses that greatly influence their visual appearance, such as wild animals. However, supervised training of a keypoint detection network requires annotating a large image dataset for each animal species, which is a labor-intensive task. To reduce the need for labeled data, we propose to learn simultaneously keypoint heatmaps and pose invariant keypoint representations in a semi-supervised manner using a small set of labeled images along with a larger set of unlabeled images. Keypoint representations are learnt with a semantic keypoint consistency constraint that forces the keypoint detection network to learn similar features for the same keypoint across the dataset. Pose invariance is achieved by making keypoint representations for the image and its augmented copies closer together in feature space. 
Our semi-supervised approach significantly outperforms previous methods on several benchmarks for human and animal body landmark localization.", + "title": "Semi-supervised Keypoint Localization", + "authors": [ + "Olga Moskvyak", + "Frederic Maire", + "Feras Dayoub", + "Mahsa Baktashmotlagh" + ], + "emails": [ + "~Mahsa_Baktashmotlagh1", + "~Frederic_Maire1", + "~Feras_Dayoub1", + "~Olga_Moskvyak1" + ], + "rank": 851 + }, + { + "url": "https://openreview.net/forum?id=-gfhS00XfKj", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 7, + 3, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Using transformers over large generated datasets, we train models to learn mathematical properties of differential systems, such as local stability, behavior at infinity and controllability. We achieve near perfect prediction of qualitative characteristics, and good approximations of numerical features of the system. This demonstrates that neural networks can learn to perform complex computations, grounded in advanced theory, from examples, without built-in mathematical knowledge.", + "title": "Learning advanced mathematical computations from examples", + "authors": [ + "Francois Charton", + "Amaury Hayat", + "Guillaume Lample" + ], + "emails": [ + "~Francois_Charton1", + "~Guillaume_Lample1", + "enpc.fr" + ], + "rank": 852 + }, + { + "url": "https://openreview.net/forum?id=MMXhHXbNsa-", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "With the wide adoption of machine learning algorithms across various application domains, there is a growing interest in the fairness properties of such algorithms. The vast majority of the activity in the field of group fairness addresses disparities between prede\ufb01ned groups based on protected features such as gender, age, and race, which need to be available at train, and often also at test, time. 
These approaches are static and retrospective, since algorithms designed to protect groups identified a priori cannot anticipate and protect the needs of different at-risk groups in the future. In this work we analyze the space of solutions for worst-case fairness beyond demographics, and propose Blind Pareto Fairness (BPF), a method that leverages no-regret dynamics to recover a fair minimax classi\ufb01er that reduces worst-case risk of any potential subgroup of suf\ufb01cient size, and guarantees that the remaining population receives the best possible level of service. BPF addresses fairness beyond demographics, that is, it does not rely on prede\ufb01ned notions of at-risk groups, neither at train nor at test time. Our experimental results show that the proposed framework improves worst-case risk in multiple standard datasets, while simultaneously providing better levels of service for the remaining population, in comparison to competing methods.", + "title": "Blind Pareto Fairness and Subgroup Robustness", + "authors": [ + "Natalia Martinez", + "Martin Bertran", + "Afroditi Papadaki", + "Miguel R. D. Rodrigues", + "Guillermo Sapiro" + ], + "emails": [ + "~Natalia_Martinez1", + "~Afroditi_Papadaki1", + "~Guillermo_Sapiro1", + "~Miguel_R._D._Rodrigues1", + "gmail.com" + ], + "rank": 853 + }, + { + "url": "https://openreview.net/forum?id=jYVY_piet7m", + "decision": "Reject", + "ratings": [ + 6, + 7, + 5 + ], + "rating": "6.00", + "confidences": [ + 5, + 4, + 4 + ], + "abstract": "Although the non-autoregressive translation model based on iterative refinement has achieved comparable performance to the autoregressive counterparts with faster decoding, we empirically found that such aggressive iterations make the acceleration rely heavily on small batch size (e.g., 1) and computing device (e.g., GPU).\nBy designing synthetic experiments, we highlight that iteration times can be significantly reduced when providing a good (partial) target context. 
\nInspired by this, we propose a two-stage translation prototype -- Hybrid-Regressive Translation (HRT). HRT first jumpily generates a discontinuous sequence by autoregression (e.g., make a prediction every k tokens, k>1). Then, with the help of the partially deterministic target context, HRT fills all the previously skipped tokens with one iteration in a non-autoregressive way.\nThe experimental results on WMT'16 En-Ro and WMT'14 En-De show that our model outperforms the state-of-the-art non-autoregressive models with multiple iterations, even autoregressive models. Moreover, compared with autoregressive models, HRT can be steadily accelerated 1.5 times regardless of batch size and device.", + "title": "Hybrid-Regressive Neural Machine Translation", + "authors": [ + "Qiang Wang", + "Heng Yu", + "Shaohui Kuang", + "Weihua Luo" + ], + "emails": [ + "~Qiang_Wang8", + "~Shaohui_Kuang1", + "~Heng_Yu1", + "alibaba-inc.com" + ], + "rank": 854 + }, + { + "url": "https://openreview.net/forum?id=lEZIPgMIB1", + "decision": "Reject", + "ratings": [ + 4, + 4, + 7, + 9 + ], + "rating": "6.00", + "confidences": [ + 5, + 5, + 5, + 5 + ], + "abstract": "We propose Parametric UMAP, a parametric variation of the UMAP (Uniform Manifold Approximation and Projection) algorithm. UMAP is a non-parametric graph-based dimensionality reduction algorithm using applied Riemannian geometry and algebraic topology to find low-dimensional embeddings of structured data. The UMAP algorithm consists of two steps: (1) Compute a graphical representation of a dataset (fuzzy simplicial complex), and (2) Through stochastic gradient descent, optimize a low-dimensional embedding of the graph. Here, we replace the second step of UMAP with a deep neural network that learns a parametric relationship between data and embedding. We demonstrate that our method performs similarly to its non-parametric counterpart while conferring the benefit of a learned parametric mapping (e.g. 
fast online embeddings for new data). We then show that UMAP loss can be extended to arbitrary deep learning applications, for example constraining the latent distribution of autoencoders, and improving classifier accuracy for semi-supervised learning by capturing structure in unlabeled data.", + "title": "Parametric UMAP: learning embeddings with deep neural networks for representation and semi-supervised learning", + "authors": [ + "Tim Sainburg", + "Leland McInnes", + "Timothy Q Gentner" + ], + "emails": [ + "~Tim_Sainburg1", + "gmail.com", + "ucsd.edu" + ], + "rank": 855 + }, + { + "url": "https://openreview.net/forum?id=JCz05AtXO3y", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 3, + 2, + 4, + 4 + ], + "abstract": "Graph neural networks are promising architecture for learning and inference with graph-structured data. However, generating informative graph level features has long been a challenge. Current practice of graph-pooling typically summarizes a graph by squeezing it into a single vector. This may lead to significant loss of predictive, iterpretable structural information, because properties of a complex system are believed to arise largely from the interaction among its components. In this paper, we analyze the intrinsic difficulty in graph classification under the unified concept of ``\"resolution dilemmas\" and propose `SLIM, an inductive neural network model for Structural Landmarking and Interaction Modelling, to remedy the information loss in graph pooling. We show that, by projecting graphs onto end-to-end optimizable, and well-aligned substructure landmarks (representatives), the resolution dilemmas can be resolved effectively, so that explicit interacting relation between component parts of a graph can be leveraged directly in explaining its complexity and predicting its property. 
Empirical evaluations, in comparison with state-of-the-art, demonstrate promising results of our approach on a number of benchmark datasets for graph classification.\n", + "title": "Structural Landmarking and Interaction Modelling: on Resolution Dilemmas in Graph Classification", + "authors": [ + "Kai Zhang", + "Yaokang Zhu", + "Jun Wang", + "Haibin Ling", + "Jie Zhang", + "Hongyuan Zha" + ], + "emails": [ + "~Haibin_Ling1", + "~Jun_Wang4", + "~Hongyuan_Zha1", + "~Jie_Zhang10", + "~Kai_Zhang1", + "~Yaokang_Zhu1" + ], + "rank": 856 + }, + { + "url": "https://openreview.net/forum?id=tYxG_OMs9WE", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Deep generative models have made important progress towards modeling complex, high dimensional data via learning latent representations. Their usefulness is nevertheless often limited by a lack of control over the generative process or a poor understanding of the latent representation. To overcome these issues, attention is now focused on discovering latent variables correlated to the data properties and ways to manipulate these properties. This paper presents the new Property controllable VAE (PCVAE), where a new Bayesian model is proposed to inductively bias the latent representation using explicit data properties via novel group-wise and property-wise disentanglement. Each data property corresponds seamlessly to a latent variable, by innovatively enforcing invertible mutual dependence between them. This allows us to move along the learned latent dimensions to control specific properties of the generated data with great precision. 
Quantitative and qualitative evaluations confirm that the PCVAE outperforms the existing models by up to 28% in capturing and 65% in manipulating the desired properties.", + "title": "Property Controllable Variational Autoencoder via Invertible Mutual Dependence", + "authors": [ + "Xiaojie Guo", + "Yuanqi Du", + "Liang Zhao" + ], + "emails": [ + "~Yuanqi_Du1", + "~Liang_Zhao1", + "~Xiaojie_Guo1" + ], + "rank": 857 + }, + { + "url": "https://openreview.net/forum?id=Y9McSeEaqUh", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "Multiclass classifiers are often designed and evaluated only on a sample from the classes on which they will eventually be applied. Hence, their final accuracy remains unknown. In this work we study how a classifier\u2019s performance over the initial class sample can be used to extrapolate its expected accuracy on a larger, unobserved set of classes. For this, we define a measure of separation between correct and incorrect classes that is independent of the number of classes: the \"reversed ROC\" (rROC), which is obtained by replacing the roles of classes and data-points in the common ROC. We show that the classification accuracy is a function of the rROC in multiclass classifiers, for which the learned representation of data from the initial class sample remains unchanged when new classes are added. Using these results we formulate a robust neural-network-based algorithm, \"CleaneX\", which learns to estimate the accuracy of such classifiers on arbitrarily large sets of classes. 
Unlike previous methods, our method uses both the observed accuracies of the classifier and densities of classification scores, and therefore achieves remarkably better predictions than current state-of-the-art methods on both simulations and real datasets of object detection, face recognition, and brain decoding.", + "title": "Predicting Classification Accuracy When Adding New Unobserved Classes", + "authors": [ + "Yuli Slavutsky", + "Yuval Benjamini" + ], + "emails": [ + "~Yuli_Slavutsky1", + "~Yuval_Benjamini1" + ], + "rank": 858 + }, + { + "url": "https://openreview.net/forum?id=gZ2qq0oPvJR", + "decision": "Reject", + "ratings": [ + 4, + 6, + 8 + ], + "rating": "6.00", + "confidences": [ + 3, + 2, + 3 + ], + "abstract": "We present a reinforcement learning (RL) based guidance system for automated theorem proving geared towards Finding Longer Proofs (FLoP). FLoP is a step towards learning to reason by analogy, reducing the dependence on large scale search in automated theorem provers. We use several simple, structured datasets with very long proofs to show that FLoP can successfully generalise a single training proof to a large class of related problems, implementing a simple form of analogical reasoning. 
On these benchmarks, FLoP is competitive with strong theorem provers despite using very limited search.", + "title": "Towards Finding Longer Proofs", + "authors": [ + "Zsolt Zombori", + "Adri\u00e1n Csisz\u00e1rik", + "Henryk Michalewski", + "Cezary Kaliszyk", + "Josef Urban" + ], + "emails": [ + "~Josef_Urban1", + "~Adri\u00e1n_Csisz\u00e1rik1", + "~Henryk_Michalewski1", + "~Zsolt_Zombori1", + "~Cezary_Kaliszyk1" + ], + "rank": 859 + }, + { + "url": "https://openreview.net/forum?id=piek7LGx7j", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 3, + 3, + 4, + 3 + ], + "abstract": "Current autoencoder-based disentangled representation learning methods achieve disentanglement by penalizing the (aggregate) posterior to encourage statistical independence of the latent factors. This approach introduces a trade-off between disentangled representation learning and reconstruction quality since the model does not have enough capacity to learn correlated latent variables that capture detail information present in most image data. To overcome this trade-off, we present a novel multi-stage modelling approach where the disentangled factors are first learned using a preexisting disentangled representation learning method (such as \u03b2-TCVAE); then, the low-quality reconstruction is improved with another deep generative model that is trained to model the missing correlated latent variables, adding detail information while maintaining conditioning on the previously learned disentangled factors. Taken together, our multi-stage modelling approach results in single, coherent probabilistic model that is theoretically justified by the principal of D-separation and can be realized with a variety of model classes including likelihood-based models such as variational autoencoders, implicit models such as generative adversarial networks, and tractable models like normalizing flows or mixtures of Gaussians. 
We demonstrate that our multi-stage model has much higher reconstruction quality than current state-of-the-art methods with equivalent disentanglement performance across multiple standard benchmarks.", + "title": "Improving the Reconstruction of Disentangled Representation Learners via Multi-Stage Modelling", + "authors": [ + "Akash Srivastava", + "Yamini Bansal", + "Yukun Ding", + "Cole Lincoln Hurwitz", + "Kai Xu", + "Bernhard Egger", + "Prasanna Sattigeri", + "Joshua B. Tenenbaum", + "Dan Gutfreund" + ], + "emails": [ + "~Kai_Xu4", + "~Akash_Srivastava1", + "~Cole_Lincoln_Hurwitz1", + "~Yukun_Ding1", + "~Bernhard_Egger1", + "~Joshua_B._Tenenbaum1", + "~Dan_Gutfreund1", + "~Prasanna_Sattigeri1", + "~Yamini_Bansal1" + ], + "rank": 860 + }, + { + "url": "https://openreview.net/forum?id=bG_lJcLwE3p", + "decision": "Reject", + "ratings": [ + 6, + 5, + 7 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Image manipulation has attracted much research over the years due to the popularity and commercial importance of the task. In recent years, deep neural network methods have been proposed for many image manipulation tasks. A major issue with deep methods is the need to train on large amounts of data from the same distribution as the target image, whereas collecting datasets encompassing the entire long-tail of images is impossible. In this paper, we demonstrate that simply training a conditional adversarial generator on the single target image is sufficient for performing complex image manipulations. We find that the key for enabling single image training is extensive augmentation of the input image and provide a novel augmentation method. Our network learns to map between a primitive representation of the image (e.g. edges and segmentation) to the image itself. At manipulation time, our generator allows for making general image changes by modifying the primitive input representation and mapping it through the network. 
We extensively evaluate our method and find that it provides remarkable performance. ", + "title": "Deep Single Image Manipulation", + "authors": [ + "Yael Vinker", + "Eliahu Horwitz", + "Nir Zabari", + "Yedid Hoshen" + ], + "emails": [ + "~Yedid_Hoshen3", + "~Yael_Vinker1", + "~Nir_Zabari1", + "mail.huji.ac.il" + ], + "rank": 861 + }, + { + "url": "https://openreview.net/forum?id=Du7s5ukNKz", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "Most existing policy learning solutions require the learning agents to receive high-quality supervision signals, e.g., rewards in reinforcement learning (RL) or high-quality expert demonstrations in behavior cloning (BC). These quality supervisions are either infeasible or prohibitively expensive to obtain in practice. We aim for a unified framework that leverages the weak supervisions to perform policy learning efficiently. To handle this problem, we treat the ``weak supervisions'' as imperfect information coming from a \\emph{peer agent}, and evaluate the learning agent's policy based on a ``correlated agreement'' with the peer agent's policy (instead of simple agreements). Our way of leveraging peer agent's information offers us a family of solutions that learn effectively from weak supervisions with theoretical guarantees. Extensive evaluations on tasks including RL with noisy reward, BC with weak demonstrations, and standard policy co-training (RL + BC) show that the proposed approach leads to substantial improvements, especially when the complexity or the noise of the learning environments grows. 
", + "title": "Policy Learning Using Weak Supervision", + "authors": [ + "Jingkang Wang", + "Hongyi Guo", + "Zhaowei Zhu", + "Yang Liu" + ], + "emails": [ + "~Jingkang_Wang1", + "~Hongyi_Guo1", + "~Yang_Liu3", + "~Zhaowei_Zhu1" + ], + "rank": 862 + }, + { + "url": "https://openreview.net/forum?id=1UtnrqVUeNE", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 3, + 3, + 2, + 3 + ], + "abstract": "As neural network classifiers are deployed in real-world applications, it is crucial that their predictions are not just accurate, but trustworthy as well. One practical solution is to assign confidence scores to each prediction, then filter out low-confidence predictions. However, existing confidence metrics are not yet sufficiently reliable for this role. This paper presents a new framework that produces more reliable confidence scores for detecting misclassification errors. This framework, RED, calibrates the classifier's inherent confidence indicators and estimates uncertainty of the calibrated confidence scores using Gaussian Processes. Empirical comparisons with other confidence estimation methods on 125 UCI datasets demonstrate that this approach is effective. 
An experiment on a vision task with a large deep learning architecture further confirms that the method can scale up, and a case study involving out-of-distribution and adversarial samples shows potential of the proposed method to improve robustness of neural network classifiers more broadly in the future.", + "title": "Detecting Misclassification Errors in Neural Networks with a Gaussian Process Model", + "authors": [ + "Xin Qiu", + "Risto Miikkulainen" + ], + "emails": [ + "~Xin_Qiu1", + "~Risto_Miikkulainen1" + ], + "rank": 863 + }, + { + "url": "https://openreview.net/forum?id=Hrtbm8u0RXu", + "decision": "Reject", + "ratings": [ + 7, + 6, + 5 + ], + "rating": "6.00", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "It is known that \u0398(N) parameters are sufficient for neural networks to memorize arbitrary N input-label pairs. By exploiting depth, we show that \u0398(N2/3) parameters suffice to memorize N pairs, under a mild condition on the separation of input points. In particular, deeper networks (even with width 3) are shown to memorize more pairs than shallow networks, which also agrees with the recent line of works on the benefits of depth for function approximation. We also provide empirical results that support our theoretical findings.", + "title": "Provable Memorization via Deep Neural Networks using Sub-linear Parameters", + "authors": [ + "Sejun Park", + "Jaeho Lee", + "Chulhee Yun", + "Jinwoo Shin" + ], + "emails": [ + "~Chulhee_Yun1", + "~Sejun_Park1", + "~Jaeho_Lee3", + "~Jinwoo_Shin1" + ], + "rank": 864 + }, + { + "url": "https://openreview.net/forum?id=6YEQUn0QICG", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 8, + 7, + 4 + ], + "rating": "6.00", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "The emerging paradigm of federated learning (FL) strives to enable collaborative training of deep models on the network edge without centrally aggregating raw data and hence improving data privacy. 
In most cases, the assumption of independent and identically distributed samples across local clients does not hold for federated learning setups. Under this setting, neural network training performance may vary significantly according to the data distribution and even hurt training convergence. Most of the previous work has focused on a difference in the distribution of labels or client shifts. Unlike those settings, we address an important problem of FL, e.g., different scanners/sensors in medical imaging, different scenery distribution in autonomous driving (highway vs. city), where local clients store examples with different distributions compared to other clients, which we denote as feature shift non-iid. In this work, we propose an effective method that uses local batch normalization to alleviate the feature shift before averaging models. The resulting scheme, called FedBN, outperforms both classical FedAvg, as well as the state-of-the-art for non-iid data (FedProx) on our extensive experiments. These empirical results are supported by a convergence analysis that shows in a simplified setting that FedBN has a faster convergence rate than FedAvg. Code is available at https://github.com/med-air/FedBN.", + "title": "FedBN: Federated Learning on Non-IID Features via Local Batch Normalization", + "authors": [ + "Xiaoxiao Li", + "Meirui JIANG", + "Xiaofei Zhang", + "Michael Kamp", + "Qi Dou" + ], + "emails": [ + "~Qi_Dou2", + "~Xiaoxiao_Li1", + "~Meirui_JIANG1", + "~Xiaofei_Zhang1", + "~Michael_Kamp1" + ], + "rank": 865 + }, + { + "url": "https://openreview.net/forum?id=XdprrZhBk8", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "We show that the error of iteratively-pruned networks empirically follows a scaling law with interpretable coefficients that depend on the architecture and task. 
We functionally approximate the error of the pruned networks, showing that it is predictable in terms of an invariant tying width, depth, and pruning level, such that networks of vastly different sparsities are freely interchangeable. We demonstrate the accuracy of this functional approximation over scales spanning orders of magnitude in depth, width, dataset size, and sparsity. We show that the scaling law functional form holds (generalizes) for large scale data (CIFAR-10, ImageNet), architectures (ResNets, VGGs) and iterative pruning algorithms (IMP, SynFlow). As neural networks become ever larger and more expensive to train, our findings suggest a framework for reasoning conceptually and analytically about pruning.", + "title": "On the Predictability of Pruning Across Scales", + "authors": [ + "Jonathan S Rosenfeld", + "Jonathan Frankle", + "Michael Carbin", + "Nir Shavit" + ], + "emails": [ + "~Nir_Shavit1", + "~Jonathan_Frankle1", + "~Jonathan_S_Rosenfeld1", + "~Michael_Carbin1" + ], + "rank": 866 + }, + { + "url": "https://openreview.net/forum?id=bgQek2O63w", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 6, + 7, + 7 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Recent work discovered that training models to be invariant to adversarial perturbations requires substantially larger datasets than those required for standard classification. Perhaps more surprisingly, these larger datasets can be \"mostly\" unlabeled. Pseudo-labeling, a technique simultaneously pioneered by four separate and simultaneous works in 2019, has been proposed as a competitive alternative to labeled data for training adversarially robust models. However, when the amount of labeled data decreases, the performance of pseudo-labeling catastrophically drops, thus questioning the theoretical insights put forward by Uesato et al. 
(2019), which suggest that the sample complexity for learning an adversarially robust model from unlabeled data should match the fully supervised case. We introduce Bootstrap Your Own Robust Latents (BYORL), a self-supervised learning technique based on BYOL for training adversarially robust models. Our method enables us to train robust representations without any labels (reconciling practice with theory). Most notably, this robust representation can be leveraged by a linear classifier to train adversarially robust models, even when the linear classifier is not trained adversarially. We evaluate BYORL and pseudo-labeling on CIFAR-10 and ImageNet and demonstrate that BYORL achieves significantly higher robustness (i.e., models resulting from BYORL are up to two times more accurate). Experiments on CIFAR-10 against \u21132 and \u2113\u221e norm-bounded perturbations demonstrate that BYORL achieves near state-of-the-art robustness with as little as 500 labeled examples. We also note that against \u21132 norm-bounded perturbations of size \u03f5=128/255, BYORL surpasses the known state-of-the-art with an accuracy under attack of 77.61% (against 72.91% for the prior art).", + "title": "Self-supervised Adversarial Robustness for the Low-label, High-data Regime", + "authors": [ + "Sven Gowal", + "Po-Sen Huang", + "Aaron van den Oord", + "Timothy Mann", + "Pushmeet Kohli" + ], + "emails": [ + "~Aaron_van_den_Oord2", + "~Sven_Gowal2", + "~Pushmeet_Kohli1", + "~Po-Sen_Huang1", + "~Timothy_Mann1" + ], + "rank": 867 + }, + { + "url": "https://openreview.net/forum?id=wOI9hqkvu_", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Despite the great success of Generative Adversarial Networks (GANs) in generating high-quality images, GANs for text generation still face two major challenges: first, most text GANs are unstable in training mainly due to ineffective optimization of the generator, and they 
heavily rely on maximum likelihood pretraining; second, most text GANs adopt autoregressive generators without latent variables, which largely limits the ability to learn latent representations for natural language text. In this paper, we propose a novel text GAN, named NAGAN, which incorporates a non-autoregressive generator with latent variables. The non-autoregressive generator can be effectively trained with gradient-based methods and free of pretraining. The latent variables facilitate representation learning for text generation applications. Experiments show that our model is competitive comparing with existing text GANs in unconditional text generation, and it outperforms existing methods on sentence manipulation in latent space and unsupervised text decipherment.", + "title": "A Text GAN for Language Generation with Non-Autoregressive Generator", + "authors": [ + "Fei Huang", + "Jian Guan", + "Pei Ke", + "Qihan Guo", + "Xiaoyan Zhu", + "Minlie Huang" + ], + "emails": [ + "~Jian_Guan1", + "~Fei_Huang3", + "~Minlie_Huang1", + "~Xiaoyan_Zhu1", + "~Pei_Ke2", + "~Qihan_Guo1" + ], + "rank": 868 + }, + { + "url": "https://openreview.net/forum?id=7FNqrcPtieT", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 3, + 2 + ], + "abstract": "Recently proposed consistency-based Semi-Supervised Learning (SSL) methods such as the Pi-model, temporal ensembling, the mean teacher, or the virtual adversarial training, achieve the state of the art results in several SSL tasks. These methods can typically reach performances that are comparable to their fully supervised counterparts while using only a fraction of labelled examples. Despite these methodological advances, the understanding of these methods is still relatively limited. To make progress, we analyse (variations of) the Pi-model in settings where analytically tractable results can be obtained. 
We establish links with Manifold Tangent Classifiers and demonstrate that the quality of the perturbations is key to obtaining reasonable SSL performances. Furthermore, we propose a simple extension of the Hidden Manifold Model that naturally incorporates data-augmentation schemes and offers a tractable framework for understanding SSL methods.", + "title": "On Data-Augmentation and Consistency-Based Semi-Supervised Learning", + "authors": [ + "Atin Ghosh", + "Alexandre H. Thiery" + ], + "emails": [ + "~Atin_Ghosh1", + "~Alexandre_H._Thiery1" + ], + "rank": 869 + }, + { + "url": "https://openreview.net/forum?id=3YdNZD5dMxI", + "decision": "Reject", + "ratings": [ + 6, + 4, + 8, + 6 + ], + "rating": "6.00", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Coupling the high-fidelity generation capabilities of label-conditional image synthesis methods with the flexibility of unconditional generative models, we propose a semantic bottleneck GAN model for unconditional synthesis of complex scenes. We assume pixel-wise segmentation labels are available during training and use them to learn the scene structure through an unconditional progressive segmentation generation network. During inference, our model first synthesizes a realistic segmentation layout from scratch, then synthesizes a realistic scene conditioned on that layout through a conditional segmentation-to-image synthesis network. When trained end-to-end, the resulting model outperforms state-of-the-art generative models in unsupervised image synthesis on two challenging domains in terms of the Frechet Inception Distance and perceptual evaluations. 
Moreover, we demonstrate that the end-to-end training significantly improves the segmentation-to-image synthesis sub-network, which results in superior performance over the state-of-the-art when conditioning on real segmentation layouts.", + "title": "Unconditional Synthesis of Complex Scenes Using a Semantic Bottleneck", + "authors": [ + "Samaneh Azadi", + "Michael Tschannen", + "Eric Tzeng", + "Sylvain Gelly", + "Trevor Darrell", + "Mario Lucic" + ], + "emails": [ + "~Mario_Lucic1", + "~Samaneh_Azadi1", + "~Sylvain_Gelly1", + "~Trevor_Darrell2", + "~Eric_Tzeng1", + "~Michael_Tschannen1" + ], + "rank": 870 + }, + { + "url": "https://openreview.net/forum?id=D2Fp_qheYu", + "decision": "Reject", + "ratings": [ + 7, + 6, + 5 + ], + "rating": "6.00", + "confidences": [ + 3, + 2, + 3 + ], + "abstract": "We propose the max-sliced Bures distance, a lower bound on the max-sliced Wasserstein-2 distance, to identify the instances associated with the maximum discrepancy between two samples. The max-slicing can be decomposed into two asymmetric divergences each expressed in terms of an optimal slice or equivalently a witness function that has large magnitude evaluations on a localized subset of instances in one distribution versus the other. We show how witness functions can be used to detect and correct for covariate shift through reweighting and to evaluate generative adversarial networks. Unlike heuristic algorithms for the max-sliced Wasserstein-2 distance that may fail to find the optimal slice, we detail a tractable algorithm that finds the global optimal slice and scales to large sample sizes. As the Bures distance quantifies differences in covariance, we generalize the max-sliced Bures distance by using non-linear mappings, enabling it to capture changes in higher-order statistics. 
We explore two types of non-linear mappings: positive semidefinite kernels where the witness functions belong to a reproducing kernel Hilbert space, and task-relevant mappings corresponding to a neural network. In the context of samples of natural images, our approach provides an interpretation of the Fr\u00e9chet Inception distance by identifying the synthetic and natural instances that are either over-represented or under-represented with respect to the other sample. We apply the proposed measure to detect imbalances in class distributions in various data sets and to critique generative models.", + "title": "Max-sliced Bures Distance for Interpreting Discrepancies", + "authors": [ + "Austin J. Brockmeier", + "Claudio Cesar Claros", + "Carlos H. Mendoza-Cardenas", + "Y\u00fcksel Karahan", + "Matthew S. Emigh", + "Luis Gonzalo Sanchez Giraldo" + ], + "emails": [ + "udel.edu", + "~Austin_J._Brockmeier1", + "navy.mil", + "~Luis_Gonzalo_Sanchez_Giraldo2" + ], + "rank": 871 + }, + { + "url": "https://openreview.net/forum?id=ww-7bdU6GA9", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 3, + 2 + ], + "abstract": "Transfer learning is an essential technique when sufficient data comes from the source domain, while no or scarce labeled data is from the target domain. We develop estimators that achieve minimax linear risk for linear regression problems under the distribution shift. Our algorithms cover different kinds of settings with covariate shift or model shift. We also consider when data are generating from either linear or general nonlinear models. We show that affine minimax rules are within an absolute constant of the minimax risk even among nonlinear rules, for a variety of source/target distributions.", + "title": "Near-Optimal Linear Regression under Distribution Shift", + "authors": [ + "Qi Lei", + "Wei Hu", + "Jason D. 
Lee" + ], + "emails": [ + "~Jason_D._Lee1", + "~Qi_Lei1", + "~Wei_Hu1" + ], + "rank": 872 + }, + { + "url": "https://openreview.net/forum?id=z9k8BWL-_2u", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "As we rely on machine learning (ML) models to make more consequential decisions, the issue of ML models perpetuating unwanted social biases has come to the fore of the public's and the research community's attention. In this paper, we focus on the problem of detecting violations of individual fairness in ML models. We formalize the problem as measuring the susceptibility of ML models against a form of adversarial attack and develop a suite of inference tools for the adversarial loss. The tools allow practitioners to assess the individual fairness of ML models in a statistically-principled way: form confidence intervals for the adversarial loss and test hypotheses of model fairness with (asymptotic) non-coverage/Type I error rate control. We demonstrate the utility of our tools in a real-world case study.", + "title": "Statistical inference for individual fairness", + "authors": [ + "Subha Maity", + "Songkai Xue", + "Mikhail Yurochkin", + "Yuekai Sun" + ], + "emails": [ + "~Subha_Maity1", + "umich.edu", + "~Mikhail_Yurochkin1", + "~Yuekai_Sun1" + ], + "rank": 873 + }, + { + "url": "https://openreview.net/forum?id=D2TE6VTJG9", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 3, + 3, + 5, + 3, + 3 + ], + "abstract": "Self-supervised representation learning solves auxiliary prediction tasks (known as pretext tasks), that do not require labeled data, to learn semantic representations. 
These pretext tasks are created solely using the input features, such as predicting a missing image patch, recovering the color channels of an image from context, or predicting missing words, yet predicting this \\textit{known} information helps in learning representations effective for downstream prediction tasks. This paper posits a mechanism based on approximate conditional independence to formalize how solving certain pretext tasks can learn representations that provably decrease the sample complexity of downstream supervised tasks. Formally, we quantify how the approximate independence between the components of the pretext task (conditional on the label and latent variables) allows us to learn representations that can solve the downstream task with drastically reduced sample complexity by just training a linear layer on top of the learned representation. ", + "title": "Predicting What You Already Know Helps: Provable Self-Supervised Learning", + "authors": [ + "Jason D. Lee", + "Qi Lei", + "Nikunj Saunshi", + "Jiacheng Zhuo" + ], + "emails": [ + "~Nikunj_Saunshi1", + "~Jason_D._Lee1", + "~Qi_Lei1", + "~Jiacheng_Zhuo1" + ], + "rank": 874 + }, + { + "url": "https://openreview.net/forum?id=jphnJNOwe36", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 5, + 7 + ], + "rating": "6.00", + "confidences": [ + 3, + 3, + 3 + ], + "abstract": "Overparameterised neural networks have demonstrated the remarkable ability to perfectly fit training samples, while still generalising to unseen test samples. However, several recent works have revealed that such models' good average performance does not always translate to good worst-case performance: in particular, they may perform poorly on subgroups that are under-represented in the training set. In this paper, we show that in certain settings, overparameterised models' performance on under-represented subgroups may be improved via post-hoc processing. 
Specifically, such models' bias can be restricted to their classification layers, and manifest as structured prediction shifts for rare subgroups. We detail two post-hoc correction techniques to mitigate this bias, which operate purely on the outputs of standard model training. We empirically verify that with such post-hoc correction, overparameterisation can improve average and worst-case performance.", + "title": "Overparameterisation and worst-case generalisation: friend or foe?", + "authors": [ + "Aditya Krishna Menon", + "Ankit Singh Rawat", + "Sanjiv Kumar" + ], + "emails": [ + "~Ankit_Singh_Rawat1", + "~Aditya_Krishna_Menon1", + "~Sanjiv_Kumar1" + ], + "rank": 875 + }, + { + "url": "https://openreview.net/forum?id=Qm7R_SdqTpT", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 3, + 3, + 4 + ], + "abstract": "Generating future frames given a few context (or past) frames is a challenging task. It requires modeling the temporal coherence of videos as well as multi-modality in terms of diversity in the potential future states. Current variational approaches for video generation tend to marginalize over multi-modal future outcomes. Instead, we propose to explicitly model the multi-modality in the future outcomes and leverage it to sample diverse futures. Our approach, Diverse Video Generator, uses a GP to learn priors on future states given the past and maintains a probability distribution over possible futures given a particular sample. We leverage the changes in this distribution over time to control the sampling of diverse future states by estimating the end of on-going sequences. In particular, we use the variance of GP over the output function space to trigger a change in the action sequence. 
We achieve state-of-the-art results on diverse future frame generation in terms of reconstruction quality and diversity of the generated sequences.", + "title": "Diverse Video Generation using a Gaussian Process Trigger", + "authors": [ + "Gaurav Shrivastava", + "Abhinav Shrivastava" + ], + "emails": [ + "~Abhinav_Shrivastava2", + "~Gaurav_Shrivastava1" + ], + "rank": 876 + }, + { + "url": "https://openreview.net/forum?id=GwjkaD3g-V1", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "Representing scenes at the granularity of objects is a prerequisite for scene understanding and decision making. We propose a novel approach for learning multi-object 3D scene representations from images. A recurrent encoder regresses a latent representation of 3D shapes, poses and texture of each object from an input RGB image. The 3D shapes are represented continuously in function-space as signed distance functions (SDF) which we efficiently pre-train from example shapes. By differentiable rendering, we train our model to decompose scenes self-supervised from RGB-D images. Our approach learns to decompose images into the constituent objects of the scene and to infer their shape, pose and texture properties from a single view. In experiments, we evaluate the accuracy of our model in inferring the 3D scene layout and demonstrate the capabilities of the generative 3D scene model.", + "title": "Semi-Supervised Learning of Multi-Object 3D Scene Representations", + "authors": [ + "Cathrin Elich", + "Martin R. 
Oswald", + "Marc Pollefeys", + "Joerg Stueckler" + ], + "emails": [ + "~Martin_R._Oswald1", + "~Joerg_Stueckler2", + "~Marc_Pollefeys2", + "~Cathrin_Elich1" + ], + "rank": 877 + }, + { + "url": "https://openreview.net/forum?id=0cmMMy8J5q", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 5, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Neural Architecture Search (NAS) is quickly becoming the standard methodology to design neural network models. However, NAS is typically compute-intensive because multiple models need to be evaluated before choosing the best one. To reduce the computational power and time needed, a proxy task is often used for evaluating each model instead of full training. In this paper, we evaluate conventional reduced-training proxies and quantify how well they preserve ranking between neural network models during search when compared with the rankings produced by final trained accuracy. We propose a series of zero-cost proxies, based on recent pruning literature, that use just a single minibatch of training data to compute a model's score. Our zero-cost proxies use 3 orders of magnitude less computation but can match and even outperform conventional proxies. For example, Spearman's rank correlation coefficient between final validation accuracy and our best zero-cost proxy on NAS-Bench-201 is 0.82, compared to 0.61 for EcoNAS (a recently proposed reduced-training proxy). Finally, we use these zero-cost proxies to enhance existing NAS search algorithms such as random search, reinforcement learning, evolutionary search and predictor-based search. For all search methodologies and across three different NAS datasets, we are able to significantly improve sample efficiency, and thereby decrease computation, by using our zero-cost proxies. For example on NAS-Bench-101, we achieved the same accuracy 4\u00d7 quicker than the best previous result. 
Our code is made public at: https://github.com/mohsaied/zero-cost-nas.", + "title": "Zero-Cost Proxies for Lightweight NAS", + "authors": [ + "Mohamed S Abdelfattah", + "Abhinav Mehrotra", + "\u0141ukasz Dudziak", + "Nicholas Donald Lane" + ], + "emails": [ + "~\u0141ukasz_Dudziak1", + "~Mohamed_S_Abdelfattah1", + "samsung.com", + "~Nicholas_Donald_Lane1" + ], + "rank": 878 + }, + { + "url": "https://openreview.net/forum?id=jQUf0TmN-oT", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 1, + 3 + ], + "abstract": " There has been a booming demand for integrating Convolutional Neural Networks (CNNs) powered functionalities into Internet-of-Thing (IoT) devices to enable ubiquitous intelligent \"IoT cameras\u201d. However, more extensive applications of such IoT systems are still limited by two challenges. First, some applications, especially medicine- and wearable-related ones, impose stringent requirements on the camera form factor. Second, powerful CNNs often require considerable storage and energy cost, whereas IoT devices often suffer from limited resources. PhlatCam, with its form factor potentially reduced by orders of magnitude, has emerged as a promising solution to the first aforementioned challenge, while the second one remains a bottleneck. Existing compression techniques, which can potentially tackle the second challenge, are far from realizing the full potential in storage and energy reduction, because they mostly focus on the CNN algorithm itself. To this end, this work proposes SACoD, a Sensor Algorithm Co-Design framework to develop more efficient CNN-powered PhlatCam. In particular, the mask coded in the PhlatCam sensor and the backend CNN model are jointly optimized in terms of both model parameters and architectures via differential neural architecture search. 
Extensive experiments including both simulation and physical measurement on manufactured masks show that the proposed SACoD framework achieves aggressive model compression and energy savings while maintaining or even boosting the task accuracy, when benchmarking over two state-of-the-art (SOTA) designs with six datasets on four different tasks. We also perform visualization for better understanding the superiority of SACoD generated designs. All the codes will be released publicly upon acceptance. ", + "title": "SACoD: Sensor Algorithm Co-Design Towards Efficient CNN-powered Intelligent PhlatCam", + "authors": [ + "Yonggan Fu", + "Yang Zhang", + "Yue Wang", + "Zhihan Lu", + "Vivek Boominathan", + "Ashok Veeraraghavan", + "Yingyan Lin" + ], + "emails": [ + "~Yingyan_Lin1", + "~Yue_Wang3", + "~Ashok_Veeraraghavan1", + "~Vivek_Boominathan1", + "~Yonggan_Fu1", + "~Zhihan_Lu1", + "~Yang_Zhang3" + ], + "rank": 879 + }, + { + "url": "https://openreview.net/forum?id=muu0gF6BW-", + "decision": "Reject", + "ratings": [ + 7, + 5, + 5, + 7 + ], + "rating": "6.00", + "confidences": [ + 5, + 3, + 5, + 3 + ], + "abstract": "The marriage of recurrent neural networks and neural ordinary differential networks (ODE-RNN) is effective in modeling irregularly sampled sequences.\nWhile ODE produces the smooth hidden states between observation intervals, the RNN will trigger a hidden state jump when a new observation arrives and thus cause the interpolation discontinuity problem.\nTo address this issue, we propose the cubic spline smoothing compensation, which is a stand-alone module upon either the output or the hidden state of ODE-RNN and can be trained end-to-end.\nWe derive its analytical solution and provide its theoretical interpolation error bound.\nExtensive experiments indicate its merits over both ODE-RNN and cubic spline interpolation.", + "title": "Cubic Spline Smoothing Compensation for Irregularly Sampled Sequences", + "authors": [ + "Jing Shi", + "Jing Bi", + "Yingru Liu", 
+ "Chenliang Xu" + ], + "emails": [ + "~Jing_Shi1", + "~Jing_Bi1", + "~Yingru_Liu1", + "~Chenliang_Xu1" + ], + "rank": 880 + }, + { + "url": "https://openreview.net/forum?id=KTlJT1nof6d", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "Factorized layers\u2014operations parameterized by products of two or more matrices\u2014occur in a variety of deep learning contexts, including compressed model training, certain types of knowledge distillation, and multi-head self-attention architectures. We study how to initialize and regularize deep nets containing such layers, examining two simple, understudied schemes, spectral initialization and Frobenius decay, for improving their performance. The guiding insight is to design optimization routines for these networks that are as close as possible to that of their well-tuned, non-decomposed counterparts; we back this intuition with an analysis of how the initialization and regularization schemes impact training with gradient descent, drawing on modern attempts to understand the interplay of weight-decay and batch-normalization. Empirically, we highlight the benefits of spectral initialization and Frobenius decay across a variety of settings. In model compression, we show that they enable low-rank methods to significantly outperform both unstructured sparsity and tensor methods on the task of training low-memory residual networks; analogs of the schemes also improve the performance of tensor decomposition techniques. For knowledge distillation, Frobenius decay enables a simple, overcomplete baseline that yields a compact model from over-parameterized training without requiring retraining with or pruning a teacher network. 
Finally, we show how both schemes applied to multi-head attention lead to improved performance on both translation and unsupervised pre-training.", + "title": "Initialization and Regularization of Factorized Neural Layers", + "authors": [ + "Mikhail Khodak", + "Neil A. Tenenholtz", + "Lester Mackey", + "Nicolo Fusi" + ], + "emails": [ + "~Lester_Mackey1", + "~Mikhail_Khodak1", + "~Nicolo_Fusi1", + "~Neil_A._Tenenholtz1" + ], + "rank": 881 + }, + { + "url": "https://openreview.net/forum?id=8nXkyH2_s6", + "decision": "Reject", + "ratings": [ + 5, + 6, + 7, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4, + 2 + ], + "abstract": "The input space of a neural network with ReLU-like activations is partitioned into multiple linear regions, each corresponding to a specific activation pattern of the included ReLU-like activations. We demonstrate that this partition exhibits the following encoding properties across a variety of deep learning models: (1) {\\it determinism}: almost every linear region contains at most one training example. We can therefore represent almost every training example by a unique activation pattern, which is parameterized by a {\\it neural code}; and (2) {\\it categorization}: according to the neural code, simple algorithms, such as K-Means, K-NN, and logistic regression, can achieve fairly good performance on both training and test data. These encoding properties surprisingly suggest that {\\it normal neural networks well-trained for classification behave as hash encoders without any extra efforts.} In addition, the encoding properties exhibit variability in different scenarios. 
{Further experiments demonstrate that {\\it model size}, {\\it training time}, {\\it training sample size}, {\\it regularization}, and {\\it label noise} contribute in shaping the encoding properties, while the impacts of the first three are dominant.} We then define an {\\it activation hash phase chart} to represent the space expanded by {model size}, training time, training sample size, and the encoding properties, which is divided into three canonical regions: {\\it under-expressive regime}, {\\it critically-expressive regime}, and {\\it sufficiently-expressive regime}.", + "title": "Neural networks behave as hash encoders: An empirical study", + "authors": [ + "Fengxiang He", + "Shiye Lei", + "Jianmin Ji", + "Dacheng Tao" + ], + "emails": [ + "~Dacheng_Tao1", + "~Fengxiang_He1", + "gmail.com", + "ustc.edu.cn" + ], + "rank": 882 + }, + { + "url": "https://openreview.net/forum?id=dV19Yyi1fS3", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 4, + 6, + 10, + 4 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 3, + 5, + 4 + ], + "abstract": "We tackle the problem of producing compact models, maximizing their accuracy for a given model size. A standard solution is to train networks with Quantization Aware Training, where the weights are quantized during training and the gradients approximated with the Straight-Through Estimator. In this paper, we extend this approach to work with extreme compression methods where the approximations introduced by STE are severe. Our proposal is to only quantize a different random subset of weights during each forward, allowing for unbiased gradients to flow through the other weights. Controlling the amount of noise and its form allows for extreme compression rates while maintaining the performance of the original model. As a result we establish new state-of-the-art compromises between accuracy and model size both in natural language processing and image classification. 
For example, applying our method to state-of-the-art Transformer and ConvNet architectures, we can achieve 82.5% accuracy on MNLI by compressing RoBERTa to 14 MB and 80.0% top-1 accuracy on ImageNet by compressing an EfficientNet-B3 to 3.3 MB.", + "title": "Training with Quantization Noise for Extreme Model Compression", + "authors": [ + "Pierre Stock", + "Angela Fan", + "Benjamin Graham", + "Edouard Grave", + "R\u00e9mi Gribonval", + "Herve Jegou", + "Armand Joulin" + ], + "emails": [ + "~Angela_Fan2", + "~Armand_Joulin1", + "~Benjamin_Graham1", + "~Pierre_Stock1", + "~Edouard_Grave1", + "~Herve_Jegou1", + "~R\u00e9mi_Gribonval1" + ], + "rank": 883 + }, + { + "url": "https://openreview.net/forum?id=MY5iHZ0IZXl", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "While \u201cattention is all you need\u201d may be proving true, we do not yet know why: attention-based transformer models such as BERT are superior but how they contextualize information even for simple grammatical rules such as subject-verb number agreement(SVA) is uncertain. We introduce multi-partite patterns, abstractions of sets of paths through a neural network model. Patterns quantify and localize the effect of an input concept (e.g., a subject\u2019s number) on an output concept (e.g. corresponding verb\u2019s number) to paths passing through a sequence of model components, thus surfacing how BERT contextualizes information. We describe guided pattern refinement, an efficient search procedure for finding sufficient and sparse patterns representative of concept-critical paths. We discover that patterns generate succinct and meaningful explanations for BERT, highlighted by \u201ccopy\u201d and \u201ctransfer\u201d operations implemented by skip connections and attention heads, respectively. 
We also show how pattern visualizations help us understand how BERT contextualizes various grammatical concepts, such as SVA across clauses, and why it makes errors in some cases while succeeding in others.", + "title": "ABSTRACTING INFLUENCE PATHS FOR EXPLAINING (CONTEXTUALIZATION OF) BERT MODELS", + "authors": [ + "Kaiji Lu", + "Zifan Wang", + "Piotr Mardziel", + "Anupam Datta" + ], + "emails": [ + "~Kaiji_Lu1", + "~Zifan_Wang1", + "gmail.com", + "~Anupam_Datta2" + ], + "rank": 884 + }, + { + "url": "https://openreview.net/forum?id=CYO5T-YjWZV", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 6, + 7 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Graph Convolutional Networks (GCNs) are leading methods for learning graph representations. However, without specially designed architectures, the performance of GCNs degrades quickly with increased depth. As the aggregated neighborhood size and neural network depth are two completely orthogonal aspects of graph representation, several methods focus on summarizing the neighborhood by aggregating K-hop neighborhoods of nodes while using shallow neural networks. However, these methods still encounter oversmoothing, and suffer from high computation and storage costs. In this paper, we use a modified Markov Diffusion Kernel to derive a variant of GCN called Simple Spectral Graph Convolution (SSGC). Our spectral analysis shows that our simple spectral graph convolution used in SSGC is a trade-off of low- and high-pass filter bands which capture the global and local contexts of each node. We provide two theoretical claims which demonstrate that we can aggregate over a sequence of increasingly larger neighborhoods compared to competitors while limiting severe oversmoothing. Our experimental evaluations show that SSGC with a linear learner is competitive in text and node classification tasks. 
Moreover, SSGC is comparable to other state-of-the-art methods for node clustering and community prediction tasks.", + "title": "Simple Spectral Graph Convolution", + "authors": [ + "Hao Zhu", + "Piotr Koniusz" + ], + "emails": [ + "~Piotr_Koniusz1", + "~Hao_Zhu2" + ], + "rank": 885 + }, + { + "url": "https://openreview.net/forum?id=7JSTDTZtn7-", + "decision": "Reject", + "ratings": [ + 5, + 7, + 6 + ], + "rating": "6.00", + "confidences": [ + 3, + 3, + 4 + ], + "abstract": "In Byzantine-robust distributed optimization, a central server wants to train a machine learning model over data distributed across multiple workers. However, a fraction of these workers may deviate from the prescribed algorithm and send arbitrary messages to the server. While this problem has received significant attention recently, most current defenses assume that the workers have identical data distribution. For realistic cases when the data across workers are heterogeneous (non-iid), we design new attacks that circumvent these defenses leading to significant loss of performance. We then propose a universal resampling scheme that addresses data heterogeneity at a negligible computational cost. We theoretically and experimentally validate our approach, showing that combining resampling with existing robust algorithms is effective against challenging attacks.\n", + "title": "Byzantine-Robust Learning on Heterogeneous Datasets via Resampling", + "authors": [ + "Lie He", + "Sai Praneeth Karimireddy", + "Martin Jaggi" + ], + "emails": [ + "~Sai_Praneeth_Karimireddy1", + "~Martin_Jaggi1", + "~Lie_He1" + ], + "rank": 886 + }, + { + "url": "https://openreview.net/forum?id=bMCfFepJXM", + "decision": "Reject", + "ratings": [ + 7, + 7, + 5, + 5 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Online interactions with the environment to collect data samples for training a Reinforcement Learning agent is not always feasible due to economic and safety concerns. 
The goal of Offline Reinforcement Learning (RL) is to address this problem by learning effective policies using previously collected datasets. Standard off-policy RL algorithms are prone to overestimations of the values of out-of-distribution (less explored) actions and are hence unsuitable for Offline RL. Behavior regularization, which constraints the learned policy within the support set of the dataset, has been proposed to tackle the limitations of standard off-policy algorithms. In this paper, we improve the behavior regularized offline reinforcement learning and propose \\emph{BRAC+}. We use an analytical upper bound on KL divergence as the behavior regularizor to reduce variance associated with sample based estimations. Additionally, we employ state-dependent Lagrange multipliers for the regularization term to avoid distributing KL divergence penalty across all states of the sampled batch. The proposed Lagrange multipliers allow more freedom of deviation to high probability (more explored) states leading to better rewards while simultaneously restricting low probability (less explored) states to prevent out-of-distribution actions. \\textcolor{blue}{To prevent catastrophic performance degradation due to rare out-of-distribution actions, we add a gradient penalty term to the policy evaluation objective to penalize the gradient of the Q value w.r.t the out-of-distribution actions. 
By doing so, the Q values evaluated at the out-of-distribution actions are bounded.} On challenging offline RL benchmarks, BRAC+ outperforms the state-of-the-art model-free and model-based approaches.", + "title": "BRAC+: Going Deeper with Behavior Regularized Offline Reinforcement Learning", + "authors": [ + "Chi Zhang", + "Sanmukh Rao Kuppannagari", + "Viktor Prasanna" + ], + "emails": [ + "~Chi_Zhang16", + "~Sanmukh_Rao_Kuppannagari1", + "~Viktor_Prasanna1" + ], + "rank": 887 + }, + { + "url": "https://openreview.net/forum?id=xyGFYKIPTDJ", + "decision": "Reject", + "ratings": [ + 6, + 7, + 5 + ], + "rating": "6.00", + "confidences": [ + 3, + 3, + 3 + ], + "abstract": "Conventional supervised learning methods, especially deep ones, are found to be sensitive to out-of-distribution (OOD) examples, largely because the learned representation mixes the semantic factor with the variation factor due to their domain-specific correlation, while only the semantic factor causes the output. To address the problem, we propose a Causal Semantic Generative model (CSG) based on causality to model the two factors separately, and learn it on a single training domain for prediction without (OOD generalization) or with unsupervised data (domain adaptation) in a test domain. We prove that CSG identifies the semantic factor on the training domain, and the invariance principle of causality subsequently guarantees the boundedness of OOD generalization error and the success of adaptation. We also design novel and delicate learning methods for both effective learning and easy prediction, following the first principle of variational Bayes and the graphical structure of CSG. 
Empirical study demonstrates the effect of our methods to improve test accuracy for OOD generalization and domain adaptation.", + "title": "Learning Causal Semantic Representation for Out-of-Distribution Prediction", + "authors": [ + "Chang Liu", + "Xinwei Sun", + "Jindong Wang", + "Tao Li", + "Tao Qin", + "Wei Chen", + "Tie-Yan Liu" + ], + "emails": [ + "~Tao_Qin1", + "~Tao_Li9", + "~Tie-Yan_Liu1", + "~Chang_Liu10", + "~Wei_Chen1", + "~Xinwei_Sun1", + "~Jindong_Wang1" + ], + "rank": 888 + }, + { + "url": "https://openreview.net/forum?id=IkYEJ5Cps5H", + "decision": "Reject", + "ratings": [ + 5, + 7, + 5, + 7 + ], + "rating": "6.00", + "confidences": [ + 5, + 3, + 2, + 4 + ], + "abstract": " Reducing the heavy computational cost of large convolutional neural networks is crucial when deploying the networks to resource-constrained environments. In this context, recent works propose channel pruning via greedy channel selection to achieve practical acceleration and memory footprint reduction. We first show this channel-wise approach ignores the inherent quadratic coupling between channels in the neighboring layers and cannot safely remove inactive weights during the pruning procedure. Furthermore, we show that these pruning methods cannot guarantee the given resource constraints are satisfied and cause discrepancy with the true objective. To this end, we formulate a principled optimization framework with discrete variable QCQP, which provably prevents any inactive weights and enables the exact guarantee of meeting the resource constraints in terms of FLOPs and memory. Also, we extend the pruning granularity beyond channels and jointly prune individual 2D convolution filters spatially for greater efficiency. 
Our experiments show competitive pruning results under the target resource constraints on CIFAR-10 and ImageNet datasets on various network architectures.\n", + "title": "Succinct Network Channel and Spatial Pruning via Discrete Variable QCQP", + "authors": [ + "Yeonwoo Jeong", + "Deokjae Lee", + "Gaon An", + "Changyong Son", + "Hyun Oh Song" + ], + "emails": [ + "~Yeonwoo_Jeong1", + "~Changyong_Son1", + "~Deokjae_Lee1", + "~Gaon_An1", + "~Hyun_Oh_Song1" + ], + "rank": 889 + }, + { + "url": "https://openreview.net/forum?id=9uvhpyQwzM_", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 7, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "Explaining the predictions made by complex machine learning models helps users to understand and accept the predicted outputs with confidence. One promising way is to use similarity-based explanation that provides similar instances as evidence to support model predictions. Several relevance metrics are used for this purpose. In this study, we investigated relevance metrics that can provide reasonable explanations to users. Specifically, we adopted three tests to evaluate whether the relevance metrics satisfy the minimal requirements for similarity-based explanation. Our experiments revealed that the cosine similarity of the gradients of the loss performs best, which would be a recommended choice in practice. In addition, we showed that some metrics perform poorly in our tests and analyzed the reasons of their failure. 
We expect our insights to help practitioners in selecting appropriate relevance metrics and also aid further researches for designing better relevance metrics for explanations.", + "title": "Evaluation of Similarity-based Explanations", + "authors": [ + "Kazuaki Hanawa", + "Sho Yokoi", + "Satoshi Hara", + "Kentaro Inui" + ], + "emails": [ + "~Kazuaki_Hanawa1", + "~Sho_Yokoi1", + "~Satoshi_Hara1", + "~Kentaro_Inui1" + ], + "rank": 890 + }, + { + "url": "https://openreview.net/forum?id=xgGS6PmzNq6", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 5, + 5 + ], + "rating": "6.00", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "Disparate impact has raised serious concerns in machine learning applications and its societal impacts. In response to the need of mitigating discrimination, fairness has been regarded as a crucial property in algorithmic design. In this work, we study the problem of disparate impact on graph-structured data. Specifically, we focus on dyadic fairness, which articulates a fairness concept that a predictive relationship between two instances should be independent of the sensitive attributes. Based on this, we theoretically relate the graph connections to dyadic fairness on link predictive scores in learning graph neural networks, and reveal that regulating weights on existing edges in a graph contributes to dyadic fairness conditionally. Subsequently, we propose our algorithm, \\textbf{FairAdj}, to empirically learn a fair adjacency matrix with proper graph structural constraints for fair link prediction, and in the meanwhile preserve predictive accuracy as much as possible. 
Empirical validation demonstrates that our method delivers effective dyadic fairness in terms of various statistics, and at the same time enjoys a favorable fairness-utility tradeoff.", + "title": "On Dyadic Fairness: Exploring and Mitigating Bias in Graph Connections", + "authors": [ + "Peizhao Li", + "Yifei Wang", + "Han Zhao", + "Pengyu Hong", + "Hongfu Liu" + ], + "emails": [ + "~Hongfu_Liu2", + "~Han_Zhao1", + "~Pengyu_Hong1", + "~Yifei_Wang3", + "~Peizhao_Li1" + ], + "rank": 891 + }, + { + "url": "https://openreview.net/forum?id=7hMenh--8g", + "decision": "Reject", + "ratings": [ + 4, + 6, + 7, + 8, + 5 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4, + 4, + 4 + ], + "abstract": "Offline Reinforcement Learning promises to learn effective policies from previously-collected, static datasets without the need for exploration. However, existing Q-learning and actor-critic based off-policy RL algorithms fail when bootstrapping from out-of-distribution (OOD) actions or states. We hypothesize that a key missing ingredient from the existing methods is a proper treatment of uncertainty in the offline setting. We propose Uncertainty Weighted Actor-Critic (UWAC), an algorithm that models the epistemic uncertainty to detect OOD state-action pairs and down-weights their contribution in the training objectives accordingly. Implementation-wise, we adopt a practical and effective dropout-based uncertainty estimation method that introduces very little overhead over existing RL algorithms. Empirically, we observe that UWAC substantially improves model stability during training. In addition, UWAC out-performs existing offline RL methods on a variety of competitive tasks, and achieves significant performance gains over the state-of-the-art baseline on datasets with sparse demonstrations collected from human experts.", + "title": "Uncertainty Weighted Offline Reinforcement Learning", + "authors": [ + "Yue Wu", + "Shuangfei Zhai", + "Nitish Srivastava", + "Joshua M. 
Susskind", + "Jian Zhang", + "Ruslan Salakhutdinov", + "Hanlin Goh" + ], + "emails": [ + "~Yue_Wu17", + "~Jian_Zhang23", + "~Shuangfei_Zhai3", + "~Joshua_M._Susskind1", + "~Ruslan_Salakhutdinov1", + "~Hanlin_Goh2", + "~Nitish_Srivastava1" + ], + "rank": 892 + }, + { + "url": "https://openreview.net/forum?id=V6BjBgku7Ro", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Learning dynamics models in high-dimensional observation spaces can be challenging for model-based RL agents. We propose a novel way to learn models in a latent space by learning to predict sequences of future actions conditioned on task completion. These models track task-relevant environment dynamics over a distribution of tasks, while simultaneously serving as an effective heuristic for planning with sparse rewards. We evaluate our method on challenging visual goal completion tasks and show a substantial increase in performance compared to prior model-free approaches.", + "title": "Planning from Pixels using Inverse Dynamics Models", + "authors": [ + "Keiran Paster", + "Sheila A. McIlraith", + "Jimmy Ba" + ], + "emails": [ + "~Keiran_Paster1", + "~Jimmy_Ba1", + "~Sheila_A._McIlraith1" + ], + "rank": 893 + }, + { + "url": "https://openreview.net/forum?id=g21u6nlbPzn", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 1 + ], + "abstract": "Performing inference on deep learning models for videos remains a challenge due to the large amount of computational resources required to achieve robust recognition. An inherent property of real-world videos is the high correlation of information across frames which can translate into redundancy in either temporal or spatial feature maps of the models, or both. 
The type of redundant features depends on the dynamics and type of events in the video: static videos have more temporal redundancy while videos focusing on objects tend to have more channel redundancy. Here we present a redundancy reduction framework, termed VA-RED2, which is input-dependent. Specifically, our VA-RED2 framework uses an input-dependent policy to decide how many features need to be computed for temporal and channel dimensions. To keep the capacity of the original model, after fully computing the necessary features, we reconstruct the remaining redundant features from those using cheap linear operations. We learn the adaptive policy jointly with the network weights in a differentiable way with a shared-weight mechanism, making it highly efficient. Extensive experiments on multiple video datasets and different visual tasks show that our framework achieves 20%\u221240% reduction in computation (FLOPs) when compared to state-of-the-art methods without any performance loss. Project page: http://people.csail.mit.edu/bpan/va-red/.", + "title": "VA-RED2: Video Adaptive Redundancy Reduction", + "authors": [ + "Bowen Pan", + "Rameswar Panda", + "Camilo Luciano Fosco", + "Chung-Ching Lin", + "Alex J Andonian", + "Yue Meng", + "Kate Saenko", + "Aude Oliva", + "Rogerio Feris" + ], + "emails": [ + "~Alex_J_Andonian1", + "~Rameswar_Panda1", + "~Camilo_Luciano_Fosco1", + "~Bowen_Pan2", + "~Chung-Ching_Lin2", + "~Yue_Meng1", + "~Kate_Saenko1", + "~Rogerio_Feris1", + "~Aude_Oliva1" + ], + "rank": 894 + }, + { + "url": "https://openreview.net/forum?id=1-Mh-cWROZ", + "decision": "Reject", + "ratings": [ + 6, + 5, + 7, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 5, + 5, + 4 + ], + "abstract": "Designing novel protein sequences consistent with a desired 3D structure or fold, often referred to as the inverse protein folding problem, is a central, but non-trivial, task in protein engineering. 
It has a wide range of applications in energy, biomedicine, and materials science. However, challenges exist due to the complex sequence-fold relationship and difficulties associated with modeling 3D folds. To overcome these challenges, we propose Fold2Seq, a novel transformer-based generative framework for designing protein sequences conditioned on a specific fold. Our model learns a fold embedding from the density of the secondary structural elements in 3D voxels, and then models the complex sequence-structure relationship by learning a joint sequence-fold embedding. Experiments on high-resolution, complete, and single-structure test set demonstrate improved performance of Fold2Seq in terms of speed and reliability for sequence design, compared to existing baselines including the state-of-the-art RosettaDesign and other neural net-based approaches. The unique advantages of fold-based Fold2Seq becomes more evident on diverse real-world test sets comprised of low-resolution, incomplete, or ensemble structures, in comparison to a structure-based model. ", + "title": "Fold2Seq: A Joint Sequence(1D)-Fold(3D) Embedding-based Generative Model for Protein Design", + "authors": [ + "Yue Cao", + "Payel Das", + "Pin-Yu Chen", + "Vijil Chenthamarakshan", + "Igor Melnyk", + "Yang Shen" + ], + "emails": [ + "~Pin-Yu_Chen1", + "~Igor_Melnyk1", + "~Payel_Das1", + "~Yue_Cao4", + "~Yang_Shen4", + "~Vijil_Chenthamarakshan1" + ], + "rank": 895 + }, + { + "url": "https://openreview.net/forum?id=MjvduJCsE4", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 7, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Modern deep learning models have achieved great success in predictive accuracy for many data modalities. However, their application to many real-world tasks is restricted by poor uncertainty estimates, such as overconfidence on out-of-distribution (OOD) data and ungraceful failing under distributional shift. 
Previous benchmarks have found that ensembles of neural networks (NNs) are typically the best calibrated models on OOD data. Inspired by this, we leverage recent theoretical advances that characterize the function-space prior of an infinitely-wide NN as a Gaussian process, termed the neural network Gaussian process (NNGP). We use the NNGP with a softmax link function to build a probabilistic model for multi-class classification and marginalize over the latent Gaussian outputs to sample from the posterior. This gives us a better understanding of the implicit prior NNs place on function space and allows a direct comparison of the calibration of the NNGP and its finite-width analogue. We also examine the calibration of previous approaches to classification with the NNGP, which treat classification problems as regression to the one-hot labels. In this case the Bayesian posterior is exact, and we compare several heuristics to generate a categorical distribution over classes. We find these methods are well calibrated under distributional shift. Finally, we consider an infinite-width final layer in conjunction with a pre-trained embedding. This replicates the important practical use case of transfer learning and allows scaling to significantly larger datasets. 
As well as achieving competitive predictive accuracy, this approach is better calibrated than its finite width analogue.", + "title": "Exploring the Uncertainty Properties of Neural Networks\u2019 Implicit Priors in the Infinite-Width Limit", + "authors": [ + "Ben Adlam", + "Jaehoon Lee", + "Lechao Xiao", + "Jeffrey Pennington", + "Jasper Snoek" + ], + "emails": [ + "~Jaehoon_Lee2", + "~Jasper_Snoek1", + "~Lechao_Xiao2", + "~Jeffrey_Pennington1", + "~Ben_Adlam1" + ], + "rank": 896 + }, + { + "url": "https://openreview.net/forum?id=ECuvULjFQia", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 3, + 3, + 2, + 4 + ], + "abstract": "By learning to predict trajectories of dynamical systems, model-based methods can make extensive use of all observations from past experience. However, due to partial observability, stochasticity, compounding errors, and irrelevant dynamics, training to predict observations explicitly often results in poor models. Model-free techniques try to side-step the problem by learning to predict values directly. While breaking the explicit dependency on future observations can result in strong performance, this usually comes at the cost of low sample efficiency, as the abundant information about the dynamics contained in future observations goes unused. Here we take a step back from both approaches: Instead of hand-designing how trajectories should be incorporated, a teacher network learns to interpret the trajectories and to provide target activations which guide a student model that can only observe the present. The teacher is trained with meta-gradients to maximize the student's performance on a validation set. 
We show that our approach performs well on tasks that are difficult for model-free and model-based methods, and we study the role of every component through ablation studies.", + "title": "A teacher-student framework to distill future trajectories", + "authors": [ + "Alexander Neitz", + "Giambattista Parascandolo", + "Bernhard Sch\u00f6lkopf" + ], + "emails": [ + "~Giambattista_Parascandolo1", + "~Alexander_Neitz1", + "~Bernhard_Sch\u00f6lkopf1" + ], + "rank": 897 + }, + { + "url": "https://openreview.net/forum?id=5lhWG3Hj2By", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "When designing controllers for safety-critical systems, practitioners often face a challenging tradeoff between robustness and performance. While robust control methods provide rigorous guarantees on system stability under certain worst-case disturbances, they often yield simple controllers that perform poorly in the average (non-worst) case. In contrast, nonlinear control methods trained using deep learning have achieved state-of-the-art performance on many control tasks, but often lack robustness guarantees. In this paper, we propose a technique that combines the strengths of these two approaches: constructing a generic nonlinear control policy class, parameterized by neural networks, that nonetheless enforces the same provable robustness criteria as robust control. Specifically, our approach entails integrating custom convex-optimization-based projection layers into a neural network-based policy. We demonstrate the power of this approach on several domains, improving in average-case performance over existing robust control methods and in worst-case stability over (non-robust) deep RL methods.", + "title": "Enforcing robust control guarantees within neural network policies", + "authors": [ + "Priya L. 
Donti", + "Melrose Roderick", + "Mahyar Fazlyab", + "J Zico Kolter" + ], + "emails": [ + "cmu.edu", + "seas.upenn.edu", + "~J_Zico_Kolter1", + "~Priya_L._Donti1" + ], + "rank": 898 + }, + { + "url": "https://openreview.net/forum?id=xnC8YwKUE3k", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 4, + 8 + ], + "rating": "6.00", + "confidences": [ + 2, + 4, + 4, + 5 + ], + "abstract": "Time-series learning is the bread and butter of data-driven *clinical decision support*, and the recent explosion in ML research has demonstrated great potential in various healthcare settings. At the same time, medical time-series problems in the wild are challenging due to their highly *composite* nature: They entail design choices and interactions among components that preprocess data, impute missing values, select features, issue predictions, estimate uncertainty, and interpret models. Despite exponential growth in electronic patient data, there is a remarkable gap between the potential and realized utilization of ML for clinical research and decision support. In particular, orchestrating a real-world project lifecycle poses challenges in engineering (i.e. hard to build), evaluation (i.e. hard to assess), and efficiency (i.e. hard to optimize). Designed to address these issues simultaneously, Clairvoyance proposes a unified, end-to-end, autoML-friendly pipeline that serves as a (i) software toolkit, (ii) empirical standard, and (iii) interface for optimization. Our ultimate goal lies in facilitating transparent and reproducible experimentation with complex inference workflows, providing integrated pathways for (1) personalized prediction, (2) treatment-effect estimation, and (3) information acquisition. Through illustrative examples on real-world data in outpatient, general wards, and intensive-care settings, we illustrate the applicability of the pipeline paradigm on core tasks in the healthcare journey. 
To the best of our knowledge, Clairvoyance is the first to demonstrate viability of a comprehensive and automatable pipeline for clinical time-series ML.", + "title": "Clairvoyance: A Pipeline Toolkit for Medical Time Series", + "authors": [ + "Daniel Jarrett", + "Jinsung Yoon", + "Ioana Bica", + "Zhaozhi Qian", + "Ari Ercole", + "Mihaela van der Schaar" + ], + "emails": [ + "~Ioana_Bica1", + "~Mihaela_van_der_Schaar2", + "~Daniel_Jarrett1", + "~Jinsung_Yoon1", + "cam.ac.uk", + "~Zhaozhi_Qian1" + ], + "rank": 899 + }, + { + "url": "https://openreview.net/forum?id=oj3bHNSq_2w", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Generative adversarial networks were introduced with a logistic MiniMax cost formulation, which normally fails to train due to saturation, and a Non-Saturating reformulation. While addressing the saturation problem, NS-GAN also inverts the generator's sample weighting, implicitly shifting emphasis from higher-scoring to lower-scoring samples when updating parameters. We present both theory and empirical results suggesting that this makes NS-GAN prone to mode dropping. We design MM-nsat, which preserves MM-GAN sample weighting while avoiding saturation by rescaling the MM-GAN minibatch gradient such that its magnitude approximates NS-GAN's gradient magnitude. MM-nsat has qualitatively different training dynamics, and on MNIST and CIFAR-10 it is stronger in terms of mode coverage, stability and FID. 
While the empirical results for MM-nsat are promising and favorable also in comparison with the LS-GAN and Hinge-GAN formulations, our main contribution is to show how and why NS-GAN's sample weighting causes mode dropping and training collapse.", + "title": "Sample weighting as an explanation for mode collapse in generative adversarial networks", + "authors": [ + "Aksel Wilhelm Wold Eide", + "Eilif Solberg", + "Ingebj\u00f8rg K\u00e5sen" + ], + "emails": [ + "~Aksel_Wilhelm_Wold_Eide1", + "ffi.no" + ], + "rank": 900 + }, + { + "url": "https://openreview.net/forum?id=0Su7gvitc1H", + "decision": "Reject", + "ratings": [ + 7, + 5, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Although the Bayesian paradigm provides a rigorous framework to estimate the full probability distribution over unknown parameters, its online implementation can be challenging due to heavy computational costs. This paper proposes Adaptive Recursive Markov Chain Monte Carlo (ARMCMC) which estimates full probability density of model parameters while alleviating shortcomings of conventional online approaches. These shortcomings include: being solely able to account for Gaussian noise, being applicable to systems with linear in the parameters (LIP) constraint, or having requirements on persistence excitation (PE). In ARMCMC, we propose a variable jump distribution, which depends on a temporal forgetting factor. This allows one to adjust the trade-off between exploitation and exploration, depending on whether there is an abrupt change to the parameter being estimated. We prove that ARMCMC requires fewer samples to achieve the same precision and reliability compared to conventional MCMC approaches. We demonstrate our approach on two challenging benchmark: the estimation of parameters in a soft bending actuator and the Hunt-Crossley dynamic model. 
Our method shows at-least 70% improvement in parameter point estimation accuracy and approximately 55% reduction in tracking error of the value of interest compared to recursive least squares and conventional MCMC.", + "title": "ARMCMC: Online Model Parameters full probability Estimation in Bayesian Paradigm", + "authors": [ + "Pedram Agand", + "Mo Chen", + "Hamid D. Taghirad" + ], + "emails": [ + "~Pedram_Agand1", + "kntu.ac.ir", + "~Mo_Chen1" + ], + "rank": 901 + }, + { + "url": "https://openreview.net/forum?id=dgd4EJqsbW5", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "A major challenge in modern reinforcement learning (RL) is efficient control of dynamical systems from high-dimensional sensory observations. Learning controllable embedding (LCE) is a promising approach that addresses this challenge by embedding the observations into a lower-dimensional latent space, estimating the latent dynamics, and utilizing it to perform control in the latent space. Two important questions in this area are how to learn a representation that is amenable to the control problem at hand, and how to achieve an end-to-end framework for representation learning and control. In this paper, we take a few steps towards addressing these questions. We first formulate a LCE model to learn representations that are suitable to be used by a policy iteration style algorithm in the latent space.We call this model control-aware representation learning(CARL). We derive a loss function and three implementations for CARL. In the offline implementation, we replace the locally-linear control algorithm (e.g., iLQR) used by the existing LCE methods with a RL algorithm, namely model-based soft actor-critic, and show that it results in significant improvement. In online CARL, we interleave representation learning and control, and demonstrate further gain in performance. 
Finally, we propose value-guided CARL, a variation in which we optimize a weighted version of the CARL loss function, where the weights depend on the TD-error of the current policy. We evaluate the proposed algorithms by extensive experiments on benchmark tasks and compare them with several LCE baselines.", + "title": "Control-Aware Representations for Model-based Reinforcement Learning", + "authors": [ + "Brandon Cui", + "Yinlam Chow", + "Mohammad Ghavamzadeh" + ], + "emails": [ + "~Yinlam_Chow1", + "fb.com", + "~Mohammad_Ghavamzadeh2" + ], + "rank": 902 + }, + { + "url": "https://openreview.net/forum?id=kBVJ2NtiY-", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 7, + 5 + ], + "rating": "6.00", + "confidences": [ + 4, + 2, + 2, + 4 + ], + "abstract": "Since reward functions are hard to specify, recent work has focused on learning policies from human feedback. However, such approaches are impeded by the expense of acquiring such feedback. Recent work proposed that agents have access to a source of information that is effectively free: in any environment that humans have acted in, the state will already be optimized for human preferences, and thus an agent can extract information about what humans want from the state. Such learning is possible in principle, but requires simulating all possible past trajectories that could have led to the observed state. This is feasible in gridworlds, but how do we scale it to complex tasks? In this work, we show that by combining a learned feature encoder with learned inverse models, we can enable agents to simulate human actions backwards in time to infer what they must have done. 
The resulting algorithm is able to reproduce a specific skill in MuJoCo environments given a single state sampled from the optimal policy for that skill.", + "title": "Learning What To Do by Simulating the Past", + "authors": [ + "David Lindner", + "Rohin Shah", + "Pieter Abbeel", + "Anca Dragan" + ], + "emails": [ + "~Anca_Dragan1", + "~Pieter_Abbeel2", + "~Rohin_Shah1", + "~David_Lindner1" + ], + "rank": 903 + }, + { + "url": "https://openreview.net/forum?id=YMsbeG6FqBU", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6 + ], + "rating": "6.00", + "confidences": [ + 5, + 3, + 3 + ], + "abstract": "Regret minimization has played a key role in online learning, equilibrium computation in games, and reinforcement learning (RL). In this paper, we describe a general model-free RL method for no-regret learning based on repeated reconsideration of past behavior: Advantage Regret-Matching Actor-Critic (ARMAC). Rather than saving past state-action data, ARMAC saves a buffer of past policies, replaying through them to reconstruct hindsight assessments of past behavior. These retrospective value estimates are used to predict conditional advantages which, combined with regret matching, produces a new policy. In particular, ARMAC learns from sampled trajectories in a centralized training setting, without requiring the application of importance sampling commonly used in Monte Carlo counterfactual regret (CFR) minimization; hence, it does not suffer from excessive variance in large environments. In the single-agent setting, ARMAC shows an interesting form of exploration by keeping past policies intact. In the multiagent setting, ARMAC in self-play approaches Nash equilibria on some partially-observable zero-sum benchmarks. 
We provide exploitability estimates in the significantly larger game of betting-abstracted no-limit Texas Hold'em.", + "title": "The Advantage Regret-Matching Actor-Critic", + "authors": [ + "Audrunas Gruslys", + "Marc Lanctot", + "Remi Munos", + "Finbarr Timbers", + "Martin Schmid", + "Julien Perolet", + "Dustin Morrill", + "Vinicius Zambaldi", + "Jean-Baptiste Lespiau", + "John Schultz", + "Mohammad Gheshlaghi Azar", + "Michael Bowling", + "Karl Tuyls" + ], + "emails": [ + "~Audrunas_Gruslys1", + "~Martin_Schmid2", + "~Vinicius_Zambaldi1", + "~Remi_Munos1", + "~Dustin_Morrill1", + "~Michael_Bowling1", + "~Finbarr_Timbers1", + "~John_Schultz1", + "~Karl_Tuyls1", + "~Marc_Lanctot1", + "~Jean-Baptiste_Lespiau1", + "~Mohammad_Gheshlaghi_Azar1", + "~Julien_Perolet1" + ], + "rank": 904 + }, + { + "url": "https://openreview.net/forum?id=DEa4JdMWRHp", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 8, + 4, + 6 + ], + "rating": "6.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Exploratory analysis of time series data can yield a better understanding of complex dynamical systems. Granger causality is a practical framework for analysing interactions in sequential data, applied in a wide range of domains. In this paper, we propose a novel framework for inferring multivariate Granger causality under nonlinear dynamics based on an extension of self-explaining neural networks. This framework is more interpretable than other neural-network-based techniques for inferring Granger causality, since in addition to relational inference, it also allows detecting signs of Granger-causal effects and inspecting their variability over time. In comprehensive experiments on simulated data, we show that our framework performs on par with several powerful baseline methods at inferring Granger causality and that it achieves better performance at inferring interaction signs. 
The results suggest that our framework is a viable and more interpretable alternative to sparse-input neural networks for inferring Granger causality.", + "title": "Interpretable Models for Granger Causality Using Self-explaining Neural Networks", + "authors": [ + "Ri\u010dards Marcinkevi\u010ds", + "Julia E Vogt" + ], + "emails": [ + "~Julia_E_Vogt1", + "~Ri\u010dards_Marcinkevi\u010ds1" + ], + "rank": 905 + }, + { + "url": "https://openreview.net/forum?id=biH_IISPxYA", + "decision": "Reject", + "ratings": [ + 5, + 6, + 7 + ], + "rating": "6.00", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "Partial label (PL) learning tackles the problem where each training instance is associated with a set of candidate labels that include both the true label and irrelevant noise labels. In this paper, we propose a novel multi-level generative model for partial label learning (MGPLL), which tackles the PL problem by learning both a label level adversarial generator and a feature level adversarial generator under a bi-directional mapping framework between the label vectors and the data samples. MGPLL uses a conditional noise label generation network to model the non-random noise labels and perform label denoising, and uses a multi-class predictor to map the training instances to the denoised label vectors, while a conditional data feature generator is used to form an inverse mapping from the denoised label vectors to data samples. Both the noise label generator and the data feature generator are learned in an adversarial manner to match the observed candidate labels and data features respectively. We conduct extensive experiments on both synthesized and real-world partial label datasets. 
The proposed approach demonstrates the state-of-the- art performance for partial label learning.", + "title": "Multi-Level Generative Models for Partial Label Learning with Non-random Label Noise", + "authors": [ + "Yan Yan", + "Yuhong Guo" + ], + "emails": [ + "~Yuhong_Guo1", + "~Yan_Yan10" + ], + "rank": 906 + }, + { + "url": "https://openreview.net/forum?id=ucEXZQncukK", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 7 + ], + "rating": "6.00", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Neural networks are known to suffer from catastrophic forgetting when trained on sequential datasets. While there have been numerous attempts to solve this problem for large-scale supervised classification, little has been done to overcome catastrophic forgetting for few-shot classification problems. Few-shot meta-learning algorithms often require all few-shot tasks to be readily available in a batch for training. The popular gradient-based model-agnostic meta-learning algorithm (MAML) is a typical algorithm that suffers from these limitations. This work introduces a Bayesian online meta-learning framework to tackle the catastrophic forgetting and the sequential few-shot tasks problems. Our framework incorporates MAML into a Bayesian online learning algorithm with Laplace approximation or variational inference. This framework enables few-shot classification on a range of sequentially arriving datasets with a single meta-learned model and training on sequentially arriving few-shot tasks. 
The experimental evaluations demonstrate that our framework can effectively prevent catastrophic forgetting and is capable of online meta-learning in various few-shot classification settings.", + "title": "Bayesian Online Meta-Learning", + "authors": [ + "Pauching Yap", + "Hippolyt Ritter", + "David Barber" + ], + "emails": [ + "~Pau_Ching_Yap1", + "~David_Barber1", + "~Hippolyt_Ritter1" + ], + "rank": 907 + }, + { + "url": "https://openreview.net/forum?id=TETmEkko7e5", + "decision": "Reject", + "ratings": [ + 5, + 6, + 7, + 5, + 7 + ], + "rating": "6.00", + "confidences": [ + 4, + 3, + 4, + 4, + 4 + ], + "abstract": "As increasingly complex AI systems are introduced into our daily lives, it becomes important for such systems to be capable of explaining the rationale for their decisions and allowing users to contest these decisions. A significant hurdle to allowing for such explanatory dialogue could be the vocabulary mismatch between the user and the AI system. This paper introduces methods for providing contrastive explanations in terms of user-specified concepts for sequential decision-making settings where the system's model of the task may be best represented as a blackbox simulator. We do this by building partial symbolic models of a local approximation of the task that can be leveraged to answer the user queries. 
We empirically test these methods on a popular Atari game (Montezuma's Revenge) and modified versions of Sokoban (a well-known planning benchmark) and report the results of user studies to evaluate whether people find explanations generated in this form useful.", + "title": "Bridging the Gap: Providing Post-Hoc Symbolic Explanations for Sequential Decision-Making Problems with Inscrutable Representations", + "authors": [ + "Sarath Sreedharan", + "Utkarsh Soni", + "Mudit Verma", + "Siddharth Srivastava", + "Subbarao Kambhampati" + ], + "emails": [ + "~Siddharth_Srivastava2", + "~Utkarsh_Soni1", + "~Sarath_Sreedharan1", + "asu.edu", + "~Subbarao_Kambhampati1" + ], + "rank": 908 + }, + { + "url": "https://openreview.net/forum?id=J7bUsLCb0zf", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 7 + ], + "rating": "5.94", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "Recent advances in off-policy deep reinforcement learning (RL) have led to impressive success in complex tasks from visual observations. Experience replay improves sample-efficiency by reusing experiences from the past, and convolutional neural networks (CNNs) process high-dimensional inputs effectively. However, such techniques demand high memory and computational bandwidth. In this paper, we present Latent Vector Experience Replay (LeVER), a simple modification of existing off-policy RL methods, to address these computational and memory requirements without sacrificing the performance of RL agents. To reduce the computational overhead of gradient updates in CNNs, we freeze the lower layers of CNN encoders early in training due to early convergence of their parameters. Additionally, we reduce memory requirements by storing the low-dimensional latent vectors for experience replay instead of high-dimensional images, enabling an adaptive increase in the replay buffer capacity, a useful technique in constrained-memory settings. 
In our experiments, we show that LeVER does not degrade the performance of RL agents while significantly saving computation and memory across a diverse set of DeepMind Control environments and Atari games. Finally, we show that LeVER is useful for computation-efficient transfer learning in RL because lower layers of CNNs extract generalizable features, which can be used for different tasks and domains.", + "title": "Compute- and Memory-Efficient Reinforcement Learning with Latent Experience Replay", + "authors": [ + "Lili Chen", + "Kimin Lee", + "Aravind Srinivas", + "Pieter Abbeel" + ], + "emails": [ + "~Aravind_Srinivas1", + "~Lili_Chen1", + "~Kimin_Lee1", + "~Pieter_Abbeel2" + ], + "rank": 909 + }, + { + "url": "https://openreview.net/forum?id=o20_NVA92tK", + "decision": "Reject", + "ratings": [ + 7, + 4, + 5, + 8 + ], + "rating": "5.94", + "confidences": [ + 5, + 5, + 4, + 4 + ], + "abstract": "We introduce three new robustness benchmarks consisting of naturally occurring distribution changes in image style, geographic location, camera operation, and more. Using our benchmarks, we take stock of previously proposed hypotheses for out-of-distribution robustness and put them to the test. We find that using larger models and synthetic data augmentation can improve robustness on real-world distribution shifts, contrary to claims in prior work. Motivated by this, we introduce a new data augmentation method which advances the state-of-the-art and outperforms models pretrained with 1000x more labeled data. We find that some methods consistently help with distribution shifts in texture and local image statistics, but these methods do not help with some other distribution shifts like geographic changes. Hence no evaluated method consistently improves robustness. 
We conclude that future research should study multiple distribution shifts simultaneously.", + "title": "A Critical Analysis of Distribution Shift", + "authors": [ + "Dan Hendrycks", + "Steven Basart", + "Norman Mu", + "Saurav Kadavath", + "Frank Wang", + "Evan Dorundo", + "Rahul Desai", + "Tyler Zhu", + "Samyak Parajuli", + "Mike Guo", + "Dawn Song", + "Jacob Steinhardt", + "Justin Gilmer" + ], + "emails": [ + "~Steven_Basart1", + "~Samyak_Parajuli1", + "~Dawn_Song1", + "~Jacob_Steinhardt1", + "~Dan_Hendrycks1", + "~Justin_Gilmer1", + "google.com", + "~Norman_Mu1", + "berkeley.edu", + "~Saurav_Kadavath1" + ], + "rank": 910 + }, + { + "url": "https://openreview.net/forum?id=GzHjhdpk-YH", + "decision": "Reject", + "ratings": [ + 6, + 7, + 5, + 6 + ], + "rating": "5.94", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Comparing metric measure spaces (i.e. a metric space endowed with a probability distribution) is at the heart of many machine learning problems. This includes for instance predicting properties of molecules in quantum chemistry or generating graphs with varying connectivity. The most popular distance between such metric measure spaces is the Gromov-Wasserstein (GW) distance, which is the solution of a quadratic assignment problem. This distance has been successfully applied to supervised learning and generative modeling, for applications as diverse as quantum chemistry or natural language processing. The GW distance is however limited to the comparison of metric measure spaces endowed with a probability distribution. This strong limitation is problematic for many applications in ML where there is no a priori natural normalization on the total mass of the data. Furthermore, imposing an exact conservation of mass across spaces is not robust to outliers and often leads to irregular matching. To alleviate these issues, we introduce two Unbalanced Gromov-Wasserstein formulations: a distance and a more tractable upper-bounding relaxation. 
They both allow the comparison of metric spaces equipped with arbitrary positive measures up to isometries. The first formulation is a positive and definite divergence based on a relaxation of the mass conservation constraint using a novel type of quadratically-homogeneous divergence.This divergence works hand in hand with the entropic regularization approach which is popular to solve large scale optimal transport problems. We show that the underlying non-convex optimization problem can be efficiently tackled using a highly parallelizable and GPU-friendly iterative scheme. The second formulation is a distance between mm-spaces up to isometries based on a conic lifting. Lastly, we provide numerical simulations to highlight the salient features of the unbalanced divergence and its potential applications in ML.", + "title": "The Unbalanced Gromov Wasserstein Distance: Conic Formulation and Relaxation", + "authors": [ + "Thibault Sejourne", + "Fran\u00e7ois-Xavier Vialard", + "Gabriel Peyr\u00e9" + ], + "emails": [ + "~Fran\u00e7ois-Xavier_Vialard2", + "~Gabriel_Peyr\u00e92", + "~Thibault_Sejourne2" + ], + "rank": 911 + }, + { + "url": "https://openreview.net/forum?id=xjXg0bnoDmS", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 5 + ], + "rating": "5.94", + "confidences": [ + 4, + 5, + 3, + 4 + ], + "abstract": "The properties of flat minima in the empirical risk landscape of neural networks have been debated for some time. Increasing evidence suggests they possess better generalization capabilities with respect to sharp ones. In this work we first discuss the relationship between alternative measures of flatness: The local entropy, which is useful for analysis and algorithm development, and the local energy, which is easier to compute and was shown empirically in extensive tests on state-of-the-art networks to be the best predictor of generalization capabilities. 
We show semi-analytically in simple controlled scenarios that these two measures correlate strongly with each other and with generalization. Then, we extend the analysis to the deep learning scenario by extensive numerical validations. We study two algorithms, Entropy-SGD and Replicated-SGD, that explicitly include the local entropy in the optimization objective. We devise a training schedule by which we consistently find flatter minima (using both flatness measures), and improve the generalization error for common architectures (e.g. ResNet, EfficientNet).", + "title": "Entropic gradient descent algorithms and wide flat minima", + "authors": [ + "Fabrizio Pittorino", + "Carlo Lucibello", + "Christoph Feinauer", + "Gabriele Perugini", + "Carlo Baldassi", + "Elizaveta Demyanenko", + "Riccardo Zecchina" + ], + "emails": [ + "~Christoph_Feinauer1", + "~Fabrizio_Pittorino1", + "~Carlo_Baldassi1", + "~Gabriele_Perugini1", + "phd.unibocconi.it", + "~Carlo_Lucibello1", + "~Riccardo_Zecchina1" + ], + "rank": 912 + }, + { + "url": "https://openreview.net/forum?id=H-BVtEaipej", + "decision": "Reject", + "ratings": [ + 6, + 6, + 7, + 5 + ], + "rating": "5.94", + "confidences": [ + 3, + 4, + 4, + 5 + ], + "abstract": "This paper advocates incorporating a Low-Rank Global Attention (LRGA) module, a Computation and memory efficient variant of the dot-product attention (Vaswani et al., 2017), to Graph Neural Networks (GNNs) for improving their generalization power. \nTo theoretically quantify the generalization properties granted by adding the LRGA module to GNNs, we focus on a specific family of expressive GNNs and show that augmenting it with LRGA provides algorithmic alignment to a powerful graph isomorphism test, namely the 2-Folklore Weisfeiler-Lehman (2-FWL) algorithm. 
In more detail we: (i) consider the recent Random Graph Neural Network (RGNN) (Sato et al., 2020) framework and prove that it is universal in probability; (ii) show that RGNN augmented with LRGA aligns with 2-FWL update step via polynomial kernels; and (iii) bound the sample complexity of the kernel's feature map when learned with a randomly initialized two-layer MLP.\nFrom a practical point of view, augmenting existing GNN layers with LRGA produces state of the art results in current GNN benchmarks. Lastly, we observe that augmenting various GNN architectures with LRGA often closes the performance gap across different models.", + "title": "Global Attention Improves Graph Networks Generalization", + "authors": [ + "Omri Puny", + "Heli Ben-Hamu", + "Yaron Lipman" + ], + "emails": [ + "~Omri_Puny1", + "~Heli_Ben-Hamu1", + "~Yaron_Lipman1" + ], + "rank": 913 + }, + { + "url": "https://openreview.net/forum?id=DAaaaqPv9-q", + "decision": "Reject", + "ratings": [ + 5, + 6, + 8, + 5 + ], + "rating": "5.94", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "This paper focuses on unsupervised/self-supervised whole-graph representation learning, which is critical in many tasks including drug and material discovery. Current methods can effectively model the local structure between different graph instances, but they fail to discover the global semantic structure of the entire dataset. In this work, we propose a unified framework called Local-instance and Global-semantic Learning (GraphLoG) for self-supervised whole-graph representation learning. Specifically, besides preserving the local instance-level structure, GraphLoG leverages a nonparametric strategy to learn hierarchical prototypes of the data. These prototypes capture the semantic clusters in the latent space, and the number of prototypes can automatically adapt to different feature distributions. We evaluate GraphLoG by pre-training it on massive unlabeled graphs followed by fine-tuning on downstream tasks. 
Extensive experiments on both chemical and biological benchmark datasets demonstrate the effectiveness of our approach. ", + "title": "Self-supervised Graph-level Representation Learning with Local and Global Structure", + "authors": [ + "Minghao Xu", + "Hang Wang", + "Bingbing Ni", + "Hongyu Guo", + "Jian Tang" + ], + "emails": [ + "~Hongyu_Guo1", + "~Hang_Wang1", + "~Minghao_Xu1", + "~Jian_Tang1", + "~Bingbing_Ni3" + ], + "rank": 914 + }, + { + "url": "https://openreview.net/forum?id=E6fb6ehhLh8", + "decision": "Reject", + "ratings": [ + 4, + 7, + 6, + 7 + ], + "rating": "5.94", + "confidences": [ + 4, + 3, + 5, + 4 + ], + "abstract": "We study the label shift problem in multi-source transfer learning and derive new generic principles. Our proposed framework unifies the principles of conditional feature alignment, label distribution ratio estimation, and domain relation weights estimation. Based on inspired practical principles, we provide a unified practical framework for three multi-source label shift transfer scenarios: learning with limited target data, unsupervised domain adaptation, and label partial unsupervised domain adaptation. We evaluate the proposed method on these scenarios by extensive experiments and show that our proposed algorithm can significantly outperform the baselines. 
", + "title": "Unified Principles For Multi-Source Transfer Learning Under Label Shifts", + "authors": [ + "changjian shui", + "Zijian Li", + "jiaqi li", + "Christian Gagn\u00e9", + "Charles Ling", + "Boyu Wang" + ], + "emails": [ + "~Boyu_Wang3", + "~changjian_shui1", + "~Zijian_Li1", + "gmail.com", + "~Charles_Ling1", + "~Christian_Gagn\u00e91" + ], + "rank": 915 + }, + { + "url": "https://openreview.net/forum?id=heqv8eIweMY", + "decision": "Reject", + "ratings": [ + 7, + 7, + 4, + 6 + ], + "rating": "5.94", + "confidences": [ + 5, + 2, + 4, + 5 + ], + "abstract": "Modern graph neural networks (GNNs) learn node embeddings through multilayer local aggregation and achieve great success in applications on assortative graphs. However, tasks on disassortative graphs usually require non-local aggregation. In addition, we find that local aggregation is even harmful for some disassortative graphs. In this work, we propose a simple yet effective non-local aggregation framework with an efficient attention-guided sorting for GNNs. Based on it, we develop various non-local GNNs. We perform thorough experiments to analyze disassortative graph datasets and evaluate our non-local GNNs. Experimental results demonstrate that our non-local GNNs significantly outperform previous state-of-the-art methods on six benchmark datasets of disassortative graphs, in terms of both model performance and efficiency.", + "title": "Non-Local Graph Neural Networks", + "authors": [ + "Meng Liu", + "Zhengyang Wang", + "Shuiwang Ji" + ], + "emails": [ + "~Meng_Liu3", + "~Zhengyang_Wang1", + "~Shuiwang_Ji1" + ], + "rank": 916 + }, + { + "url": "https://openreview.net/forum?id=4jXnFYaDOuD", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 7 + ], + "rating": "5.94", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "Integrating information from multiple modalities (e.g., verbal, acoustic and visual data) into meaningful representations has seen great progress in recent years. 
However, two challenges are not sufficiently addressed by current approaches: (1) computationally efficient training of multimodal autoencoder networks which are robust in the absence of modalities, and (2) unsupervised learning of important subspaces in each modality which are correlated with other modalities. In this paper we propose the IMA (Importance-based Multimodal Autoencoder) model, a scalable model that learns modality importances and robust multimodal representations through a novel cross-covariance based loss function. We conduct experiments on MNIST-TIDIGITS a multimodal dataset of spoken and image digits,and on IEMOCAP, a multimodal emotion corpus. The IMA model is able to distinguish digits from uncorrelated noise, and word-level importances learnt that correspond to the separation between function and emotional words. The multimodal representations learnt by IMA are also competitive with state-of-the-art baseline approaches on downstream tasks.", + "title": "Importance-based Multimodal Autoencoder", + "authors": [ + "Sayan Ghosh", + "Eugene Laksana", + "Louis-Philippe Morency", + "Stefan Scherer" + ], + "emails": [ + "~Eugene_Laksana1", + "~Stefan_Scherer1", + "~Sayan_Ghosh1", + "~Louis-Philippe_Morency1" + ], + "rank": 917 + }, + { + "url": "https://openreview.net/forum?id=Svfh1_hYEtF", + "decision": "Reject", + "ratings": [ + 6, + 6, + 7, + 5 + ], + "rating": "5.94", + "confidences": [ + 5, + 5, + 3, + 4 + ], + "abstract": "There has been a surge of interest in continual learning and federated learning, both of which are important in deep neural networks in real-world scenarios. Yet little research has been done regarding the scenario where each client learns on a sequence of tasks from a private local data stream. This problem of federated continual learning poses new challenges to continual learning, such as utilizing knowledge from other clients, while preventing interference from irrelevant knowledge. 
To resolve these issues, we propose a novel federated continual learning framework, Federated Weighted Inter-client Transfer (FedWeIT), which decomposes the network weights into global federated parameters and sparse task-specific parameters, and each client receives selective knowledge from other clients by taking a weighted combination of their task-specific parameters.FedWeITminimizes interference between incompatible tasks, and also allows positive knowledge transfer across clients during learning. We validate ourFedWeITagainst existing federated learning and continual learning methods under varying degrees of task similarity across clients, and our model significantly outperforms them with a large reduction in the communication cost.", + "title": "Federated Continual Learning with Weighted Inter-client Transfer", + "authors": [ + "Jaehong Yoon", + "Wonyong Jeong", + "Giwoong Lee", + "Eunho Yang", + "Sung Ju Hwang" + ], + "emails": [ + "~Giwoong_Lee1", + "~Eunho_Yang1", + "~Jaehong_Yoon1", + "~Wonyong_Jeong1", + "~Sung_Ju_Hwang1" + ], + "rank": 918 + }, + { + "url": "https://openreview.net/forum?id=ipUPfYxWZvM", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 7, + 5 + ], + "rating": "5.94", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "With sequentially stacked self-attention, (optional) encoder-decoder attention, and feed-forward layers, Transformer achieves big success in natural language processing (NLP), and many variants have been proposed. Currently, almost all these models assume that the \\emph{layer order} is fixed and kept the same across data samples. We observe that different data samples actually favor different orders of the layers. Based on this observation, in this work, we break the assumption of the fixed layer order in Transformer and introduce instance-wise layer reordering into model structure. 
Our Instance-wise Ordered Transformer (IOT) can model variant functions by reordered layers, which enables each sample to select the better one to improve the model performance under the constraint of almost same number of parameters. To achieve this, we introduce a light predictor with negligible parameter and inference cost to decide the most capable and favorable layer order for any input sequence. Experiments on 3 tasks (neural machine translation, abstractive summarization, and code generation) and 9 datasets demonstrate consistent improvements of our method. We further show that our method can also be applied to other architectures beyond Transformer. Our code is released at Github\\footnote{\\url{https://github.com/instance-wise-ordered-transformer/IOT}}.", + "title": "IOT: Instance-wise Layer Reordering for Transformer Structures", + "authors": [ + "Jinhua Zhu", + "Lijun Wu", + "Yingce Xia", + "Shufang Xie", + "Tao Qin", + "Wengang Zhou", + "Houqiang Li", + "Tie-Yan Liu" + ], + "emails": [ + "~Tao_Qin1", + "~Shufang_Xie1", + "~Tie-Yan_Liu1", + "~Lijun_Wu1", + "~Wengang_Zhou1", + "~Houqiang_Li1", + "~Jinhua_Zhu1", + "~Yingce_Xia1" + ], + "rank": 919 + }, + { + "url": "https://openreview.net/forum?id=EyDgK7q5vwJ", + "decision": "Reject", + "ratings": [ + 7, + 6, + 6, + 5 + ], + "rating": "5.94", + "confidences": [ + 4, + 4, + 5, + 5 + ], + "abstract": "We present a new deep neural network architecture, named EDGaM, for deep clustering. This architecture can seamlessly learn deep auto-encoders and capture common group features of complex inputs in the encoded latent space. The key idea is to introduce a differentiable Gaussian mixture neural network between an encoder and a decoder. In particular, EDGaM streamlines the iterative Expectation-Maximum (EM) algorithm of the Gaussian mixture models into network design and replaces the alternative update with a forward-backward optimization. 
Being differentiable, both network weights and clustering centroids in EDGaM can be learned simultaneously in an end-to-end manner through standard stochastic gradient descent. To avoid preserving too many sample-specific details, we use both the clustering centroid and the original latent embedding for decoding. Meanwhile, we distill the soft clustering assignment for each sample via entropy minimization such that a clear cluster structure is exhibited. Our experiments show that our method outperforms state-of-the-art unsupervised clustering techniques in terms of both efficiency and clustering performance. ", + "title": "Streamlining EM into Auto-Encoder Networks", + "authors": [ + "Yuangang Pan", + "Ivor Tsang" + ], + "emails": [ + "~Ivor_Tsang1", + "~Yuangang_Pan2" + ], + "rank": 920 + }, + { + "url": "https://openreview.net/forum?id=0F_OC_oROWb", + "decision": "Reject", + "ratings": [ + 6, + 3, + 7, + 8 + ], + "rating": "5.94", + "confidences": [ + 2, + 5, + 4, + 5 + ], + "abstract": "We propose RSO (random search optimization), a gradient free, sampling based approach for training deep neural networks. To this end, RSO adds a perturbation to a weight in a deep neural network and tests if it reduces the loss on a mini-batch. If this reduces the loss, the weight is updated, otherwise the existing weight is retained. Surprisingly, we find that repeating this process a few times for each weight is sufficient to train a deep neural network. The number of weight updates for RSO is an order of magnitude lesser when compared to backpropagation with SGD. RSO can make aggressive weight updates in each step as there is no concept of learning rate. The weight update step for individual layers is also not coupled with the magnitude of the loss. RSO is evaluated on classification tasks on MNIST and CIFAR-10 datasets with deep neural networks of 6 to 10 layers where it achieves an accuracy of 99.1% and 81.8% respectively. 
We also find that after updating the weights just 5 times, the algorithm obtains a classification accuracy of 98% on MNIST.", + "title": "RSO: A Gradient Free Sampling Based Approach For Training Deep Neural Networks", + "authors": [ + "Rohun Tripathi", + "Bharat Singh" + ], + "emails": [ + "~Rohun_Tripathi1", + "~Bharat_Singh2" + ], + "rank": 921 + }, + { + "url": "https://openreview.net/forum?id=y4-e1K23GLC", + "decision": "Reject", + "ratings": [ + 7, + 7, + 5, + 5 + ], + "rating": "5.93", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "We initiate the study of the inherent tradeoffs between the size of a neural network and its robustness, as measured by its Lipschitz constant. We make a precise conjecture that, for any Lipschitz activation function and for most datasets, any two-layers neural network with k neurons that perfectly fit the data must have its Lipschitz constant larger (up to a constant) than n/k where n is the number of datapoints. In particular, this conjecture implies that overparametrization is necessary for robustness, since it means that one needs roughly one neuron per datapoint to ensure a O(1)-Lipschitz network, while mere data fitting of d-dimensional data requires only one neuron per d datapoints. We prove a weaker version of this conjecture when the Lipschitz constant is replaced by an upper bound on it based on the spectral norm of the weight matrix. We also prove the conjecture in the high-dimensional regime n\u2248d (which we also refer to as the undercomplete case, since only k\u2264d is relevant here). Finally we prove the conjecture for polynomial activation functions of degree p when n\u2248dp. 
We complement these findings with experimental evidence supporting the conjecture.", + "title": "A law of robustness for two-layers neural networks", + "authors": [ + "Sebastien Bubeck", + "Yuanzhi Li", + "Dheeraj Mysore Nagaraj" + ], + "emails": [ + "~Yuanzhi_Li1", + "~Dheeraj_Mysore_Nagaraj1", + "~Sebastien_Bubeck1" + ], + "rank": 922 + }, + { + "url": "https://openreview.net/forum?id=kVZ6WBYazFq", + "decision": "Reject", + "ratings": [ + 6, + 7, + 6, + 5 + ], + "rating": "5.93", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Modern machine learning techniques have enjoyed widespread success, but are plagued by lack of transparency in their decision making, which has led to the emergence of the field of explainable AI. One popular approach called LIME, seeks to explain an opaque model's behavior, by training a surrogate interpretable model to be locally faithful on perturbed instances. \nDespite being model-agnostic and easy-to-use, it is known that LIME's explanations can be unstable and are susceptible to adversarial attacks as a result of Out-Of-Distribution (OOD) sampling. Quality of explanations is also calculated heuristically, and lacks a strong theoretical foundation. In spite of numerous attempts to remedy some of these issues, making the LIME framework more trustworthy and reliable remains an open problem.\n\nIn this work, we demonstrate that the OOD sampling problem stems from rigidity of the perturbation procedure. To resolve this issue, we propose a theoretically sound framework based on uniform sampling of user-defined subspaces. Through logical constraints, we afford the end-user the flexibility to delineate the precise subspace of the input domain to be explained. This not only helps mitigate the problem of OOD sampling, but also allow experts to drill down and uncover bugs deep inside the model. 
For testing the quality of generated explanations, we develop an efficient estimation algorithm that is able to certifiably measure the true value of metrics such as fidelity up to any desired degree of accuracy, which can help in building trust in the generated explanations. Our framework called CLIME can be applied to any ML model, and extensive experiments demonstrate its versatility on real-world problems.\n", + "title": "Constraint-Driven Explanations of Black-Box ML Models", + "authors": [ + "Aditya Aniruddha Shrotri", + "Nina Narodytska", + "Alexey Ignatiev", + "Joao Marques-Silva", + "Kuldeep S. Meel", + "Moshe Vardi" + ], + "emails": [ + "~Moshe_Vardi1", + "~Alexey_Ignatiev1", + "~Nina_Narodytska1", + "~Joao_Marques-Silva1", + "~Aditya_Aniruddha_Shrotri1", + "~Kuldeep_S._Meel2" + ], + "rank": 923 + }, + { + "url": "https://openreview.net/forum?id=dx4b7lm8jMM", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 8, + 4, + 5 + ], + "rating": "5.93", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Sequential data such as time series, video, or text can be challenging to analyse as the ordered structure gives rise to complex dependencies. At the heart of this is non-commutativity, in the sense that reordering the elements of a sequence can completely change its meaning. We use a classical mathematical object -- the free algebra -- to capture this non-commutativity. To address the innate computational complexity of this algebra, we use compositions of low-rank tensor projections. 
This yields modular and scalable building blocks that give state-of-the-art performance on standard benchmarks such as multivariate time series classification, mortality prediction and generative models for video.", + "title": "Seq2Tens: An Efficient Representation of Sequences by Low-Rank Tensor Projections", + "authors": [ + "Csaba Toth", + "Patric Bonnier", + "Harald Oberhauser" + ], + "emails": [ + "~Csaba_Toth2", + "~Harald_Oberhauser1", + "~Patric_Bonnier1" + ], + "rank": 924 + }, + { + "url": "https://openreview.net/forum?id=nzKv5vxZfge", + "decision": "Reject", + "ratings": [ + 7, + 5, + 7, + 5 + ], + "rating": "5.93", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "With the growing success of graph neural networks (GNNs), the explainability of GNN is attracting considerable attention. However, current works on feature attribution, which frame explanation generation as attributing a prediction to the graph features, mostly focus on the statistical interpretability. They may struggle to distinguish causal and noncausal effects of features, and quantify redundancy among features, thus resulting in unsatisfactory explanations. In this work, we focus on the causal interpretability in GNNs and propose a method, Causal Screening, from the perspective of cause-effect. It incrementally selects a graph feature (i.e., edge) with large causal attribution, which is formulated as the individual causal effect on the model outcome. As a model-agnostic tool, Causal Screening can be used to generate faithful and concise explanations for any GNN model. Further, by conducting extensive experiments on three graph classi\ufb01cation datasets, we observe that Causal Screening achieves signi\ufb01cant improvements over state-of-the-art approaches w.r.t. 
two quantitative metrics: predictive accuracy, contrastivity, and safely passes sanity checks.", + "title": "Causal Screening to Interpret Graph Neural Networks", + "authors": [ + "Xiang Wang", + "Yingxin Wu", + "An Zhang", + "Xiangnan He", + "Tat-seng Chua" + ], + "emails": [ + "mail.ustc.edu.cn", + "~Tat-seng_Chua1", + "~Xiangnan_He1", + "~Xiang_Wang6", + "~An_Zhang2" + ], + "rank": 925 + }, + { + "url": "https://openreview.net/forum?id=hLElJeJKxzY", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 7 + ], + "rating": "5.93", + "confidences": [ + 5, + 4, + 2, + 4 + ], + "abstract": " Although Deep Reinforcement Learning (DRL) has proven its capability to learn optimal policies by directly interacting with simulation environments, how to combine DRL with supervised learning and leverage additional knowledge to assist the DRL agent effectively still remains difficult. This study proposes a novel approach integrating deep Q learning from dynamic demonstrations with a behavioral cloning model (DQfDD-BC), which includes a supervised learning technique of instructing a DRL model to enhance its performance. Specifically, the DQfDD-BC model leverages historical demonstrations to pre-train a supervised BC model and consistently update it by learning the dynamically updated demonstrations. Then the DQfDD-BC model manages the sample complexity by exploiting both the historical and generated demonstrations. An expert loss function is designed to compare actions generated by the DRL model with those obtained from the BC model to provide advantageous guidance for policy improvements. Experimental results in several OpenAI Gym environments show that the proposed approach adapts to different performance levels of demonstrations, and meanwhile, accelerates the learning processes. 
As illustrated in an ablation study, the dynamic demonstration and expert loss mechanisms with the utilization of a BC model contribute to improving the learning convergence performance compared with the origin DQfD model. ", + "title": "Deep Q Learning from Dynamic Demonstration with Behavioral Cloning", + "authors": [ + "Xiaoshuang Li", + "Junchen Jin", + "Xiao Wang", + "Fei-Yue Wang" + ], + "emails": [ + "ia.ac.cn", + "~Xiaoshuang_Li1", + "kth.se", + "~Fei-Yue_Wang2" + ], + "rank": 926 + }, + { + "url": "https://openreview.net/forum?id=QFYnKlBJYR", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 6, + 6, + 3 + ], + "rating": "5.93", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "Action and observation delays commonly occur in many Reinforcement Learning applications, such as remote control scenarios. We study the anatomy of randomly delayed environments, and show that partially resampling trajectory fragments in hindsight allows for off-policy multi-step value estimation. We apply this principle to derive Delay-Correcting Actor-Critic (DCAC), an algorithm based on Soft Actor-Critic with significantly better performance in environments with delays. This is shown theoretically and also demonstrated practically on a delay-augmented version of the MuJoCo continuous control benchmark.", + "title": "Reinforcement Learning with Random Delays", + "authors": [ + "Yann Bouteiller", + "Simon Ramstedt", + "Giovanni Beltrame", + "Christopher Pal", + "Jonathan Binas" + ], + "emails": [ + "~Jonathan_Binas1", + "~Simon_Ramstedt1", + "polymtl.ca", + "~Christopher_Pal1", + "~Yann_Bouteiller1" + ], + "rank": 927 + }, + { + "url": "https://openreview.net/forum?id=ZW0yXJyNmoG", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 4, + 6, + 7 + ], + "rating": "5.93", + "confidences": [ + 2, + 4, + 3, + 5 + ], + "abstract": "Generative Adversarial Networks are notoriously challenging to train. 
The underlying minmax optimization is highly susceptible to the variance of the stochastic gradient and the rotational component of the associated game vector field. To tackle these challenges, we propose the Lookahead algorithm for minmax optimization, originally developed for single objective minimization only. The backtracking step of our Lookahead\u2013minmax naturally handles the rotational game dynamics, a property which was identified to be key for enabling gradient ascent descent methods to converge on challenging examples often analyzed in the literature. Moreover, it implicitly handles high variance without using large mini-batches, known to be essential for reaching state of the art performance. Experimental results on MNIST, SVHN, CIFAR-10, and ImageNet demonstrate a clear advantage of combining Lookahead\u2013minmax with Adam or extragradient, in terms of performance and improved stability, for negligible memory and computational cost. Using 30-fold fewer parameters and 16-fold smaller minibatches we outperform the reported performance of the class-dependent BigGAN on CIFAR-10 by obtaining FID of 12.19 without using the class labels, bringing state-of-the-art GAN training within reach of common computational resources.", + "title": "Taming GANs with Lookahead-Minmax", + "authors": [ + "Tatjana Chavdarova", + "Matteo Pagliardini", + "Sebastian U Stich", + "Fran\u00e7ois Fleuret", + "Martin Jaggi" + ], + "emails": [ + "~Matteo_Pagliardini1", + "~Sebastian_U_Stich1", + "~Martin_Jaggi1", + "~Fran\u00e7ois_Fleuret2", + "~Tatjana_Chavdarova2" + ], + "rank": 928 + }, + { + "url": "https://openreview.net/forum?id=0aW6lYOYB7d", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 4, + 7, + 6 + ], + "rating": "5.93", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "In this paper, we consider fully connected feed-forward deep neural networks where weights and biases are independent and identically distributed according to Gaussian distributions. 
Extending previous results (Matthews et al., 2018a;b;Yang, 2019) we adopt a function-space perspective, i.e. we look at neural networks as infinite-dimensional random elements on the input space RI. Under suitable assumptions on the activation function we show that: i) a network defines a continuous Gaussian process on the input space RI; ii) a network with re-scaled weights converges weakly to a continuous Gaussian process in the large-width limit; iii) the limiting Gaussian process has almost surely locally \u03b3-H\u00f6lder continuous paths, for 0<\u03b3<1. Our results contribute to recent theoretical studies on the interplay between infinitely wide deep neural networks and Gaussian processes by establishing weak convergence in function-space with respect to a stronger metric.", + "title": "Large-width functional asymptotics for deep Gaussian neural networks", + "authors": [ + "Daniele Bracale", + "Stefano Favaro", + "Sandra Fortini", + "Stefano Peluchetti" + ], + "emails": [ + "~Stefano_Favaro1", + "~Stefano_Peluchetti1", + "unibocconi.it", + "edu.unito.it" + ], + "rank": 929 + }, + { + "url": "https://openreview.net/forum?id=UFGEelJkLu5", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 5 + ], + "rating": "5.93", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Large-scale language models have recently demonstrated impressive empirical performance. Nevertheless, the improved results are attained at the price of bigger models, more power consumption, and slower inference, which hinder their applicability to low-resource (both memory and computation) platforms. Knowledge distillation (KD) has been demonstrated as an effective framework for compressing such big models. However, large-scale neural network systems are prone to memorize training instances, and thus tend to make inconsistent predictions when the data distribution is altered slightly. 
Moreover, the student model has few opportunities to request useful information from the teacher model when there is limited task-specific data available. To address these issues, we propose MixKD, a data-agnostic distillation framework that leverages mixup, a simple yet efficient data augmentation approach, to endow the resulting model with stronger generalization ability. Concretely, in addition to the original training examples, the student model is encouraged to mimic the teacher's behavior on the linear interpolation of example pairs as well. We prove from a theoretical perspective that under reasonable conditions MixKD gives rise to a smaller gap between the generalization error and the empirical error. To verify its effectiveness, we conduct experiments on the GLUE benchmark, where MixKD consistently leads to significant gains over the standard KD training, and outperforms several competitive baselines. Experiments under a limited-data setting and ablation studies further demonstrate the advantages of the proposed approach.", + "title": "MixKD: Towards Efficient Distillation of Large-scale Language Models", + "authors": [ + "Kevin J Liang", + "Weituo Hao", + "Dinghan Shen", + "Yufan Zhou", + "Weizhu Chen", + "Changyou Chen", + "Lawrence Carin" + ], + "emails": [ + "~Lawrence_Carin2", + "~Changyou_Chen1", + "~Kevin_J_Liang1", + "~Weizhu_Chen1", + "~Yufan_Zhou1", + "~Dinghan_Shen1", + "~Weituo_Hao1" + ], + "rank": 930 + }, + { + "url": "https://openreview.net/forum?id=p8agn6bmTbr", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 3, + 7, + 7 + ], + "rating": "5.93", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "We introduce a notion of usable information contained in the representation learned by a deep network, and use it to study how optimal representations for the task emerge during training. 
We show that the implicit regularization coming from training with Stochastic Gradient Descent with a high learning-rate and small batch size plays an important role in learning minimal sufficient representations for the task. In the process of arriving at a minimal sufficient representation, we find that the content of the representation changes dynamically during training. In particular, we find that semantically meaningful but ultimately irrelevant information is encoded in the early transient dynamics of training, before being later discarded. In addition, we evaluate how perturbing the initial part of training impacts the learning dynamics and the resulting representations. We show these effects on both perceptual decision-making tasks inspired by neuroscience literature, as well as on standard image classification tasks.", + "title": "Usable Information and Evolution of Optimal Representations During Training", + "authors": [ + "Michael Kleinman", + "Alessandro Achille", + "Daksh Idnani", + "Jonathan Kao" + ], + "emails": [ + "~Daksh_Idnani1", + "~Michael_Kleinman2", + "~Jonathan_Kao1", + "~Alessandro_Achille1" + ], + "rank": 931 + }, + { + "url": "https://openreview.net/forum?id=M88oFvqp_9", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 5, + 5 + ], + "rating": "5.93", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "We consider the novel task of learning disentangled representations of object shape and appearance across multiple domains (e.g., dogs and cars). The goal is to learn a generative model that learns an intermediate distribution, which borrows a subset of properties from each domain, enabling the generation of images that did not exist in any domain exclusively. This challenging problem requires an accurate disentanglement of object shape, appearance, and background from each domain, so that the appearance and shape factors from the two domains can be interchanged. 
We augment an existing approach that can disentangle factors within a single domain but struggles to do so across domains. Our key technical contribution is to represent object appearance with a differentiable histogram of visual features, and to optimize the generator so that two images with the same latent appearance factor but different latent shape factors produce similar histograms. On multiple multi-domain datasets, we demonstrate our method leads to accurate and consistent appearance and shape transfer across domains.", + "title": "Generating Furry Cars: Disentangling Object Shape and Appearance across Multiple Domains", + "authors": [ + "Utkarsh Ojha", + "Krishna Kumar Singh", + "Yong Jae Lee" + ], + "emails": [ + "~Yong_Jae_Lee2", + "~Krishna_Kumar_Singh4", + "~Utkarsh_Ojha1" + ], + "rank": 932 + }, + { + "url": "https://openreview.net/forum?id=qkLMTphG5-h", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 5, + 6, + 7 + ], + "rating": "5.93", + "confidences": [ + 2, + 4, + 4, + 5 + ], + "abstract": "Model-agnostic meta-learning (MAML) is a popular method for few-shot learning but assumes that we have access to the meta-training set. In practice, training on the meta-training set may not always be an option due to data privacy concerns, intellectual property issues, or merely lack of computing resources. In this paper, we consider the novel problem of repurposing pretrained MAML checkpoints to solve new few-shot classification tasks. Because of the potential distribution mismatch, the original MAML steps may no longer be optimal. Therefore we propose an alternative meta-testing procedure and combine MAML gradient steps with adversarial training and uncertainty-based stepsize adaptation. 
Our method outperforms \"vanilla\" MAML on same-domain and cross-domains benchmarks using both SGD and Adam optimizers and shows improved robustness to the choice of base stepsize.", + "title": "Repurposing Pretrained Models for Robust Out-of-domain Few-Shot Learning", + "authors": [ + "Namyeong Kwon", + "Hwidong Na", + "Gabriel Huang", + "Simon Lacoste-Julien" + ], + "emails": [ + "~Namyeong_Kwon1", + "~Hwidong_Na1", + "~Simon_Lacoste-Julien1", + "~Gabriel_Huang1" + ], + "rank": 933 + }, + { + "url": "https://openreview.net/forum?id=Db4yerZTYkz", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 4, + 6 + ], + "rating": "5.93", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "Shape and texture are two prominent and complementary cues for recognizing objects. Nonetheless, Convolutional Neural Networks are often biased towards either texture or shape, depending on the training dataset. Our ablation shows that such bias degenerates model performance. Motivated by this observation, we develop a simple algorithm for shape-texture debiased learning. To prevent models from exclusively attending on a single cue in representation learning, we augment training data with images with conflicting shape and texture information (eg, an image of chimpanzee shape but with lemon texture) and, most importantly, provide the corresponding supervisions from shape and texture simultaneously. \n\nExperiments show that our method successfully improves model performance on several image recognition benchmarks and adversarial robustness. For example, by training on ImageNet, it helps ResNet-152 achieve substantial improvements on ImageNet (+1.2%), ImageNet-A (+5.2%), ImageNet-C (+8.3%) and Stylized-ImageNet (+11.1%), and on defending against FGSM adversarial attacker on ImageNet (+14.4%). Our method also claims to be compatible with other advanced data augmentation strategies, eg, Mixup, and CutMix. 
The code is available here: https://github.com/LiYingwei/ShapeTextureDebiasedTraining.", + "title": "Shape-Texture Debiased Neural Network Training", + "authors": [ + "Yingwei Li", + "Qihang Yu", + "Mingxing Tan", + "Jieru Mei", + "Peng Tang", + "Wei Shen", + "Alan Yuille", + "cihang xie" + ], + "emails": [ + "~Wei_Shen2", + "~Mingxing_Tan3", + "~cihang_xie1", + "~Yingwei_Li4", + "~Peng_Tang1", + "~Qihang_Yu1", + "~Alan_Yuille1", + "~Jieru_Mei2" + ], + "rank": 934 + }, + { + "url": "https://openreview.net/forum?id=hb1sDDSLbV", + "decision": "Accept (Poster)", + "ratings": [ + 9, + 2, + 7, + 5 + ], + "rating": "5.93", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "In this paper, we investigate the principle that good explanations are hard to vary in the context of deep learning.\nWe show that averaging gradients across examples -- akin to a logical OR of patterns -- can favor memorization and `patchwork' solutions that sew together different strategies, instead of identifying invariances.\nTo inspect this, we first formalize a notion of consistency for minima of the loss surface, which measures to what extent a minimum appears only when examples are pooled.\nWe then propose and experimentally validate a simple alternative algorithm based on a logical AND, that focuses on invariances and prevents memorization in a set of real-world tasks. 
\nFinally, using a synthetic dataset with a clear distinction between invariant and spurious mechanisms, we dissect learning signals and compare this approach to well-established regularizers.", + "title": "Learning explanations that are hard to vary", + "authors": [ + "Giambattista Parascandolo", + "Alexander Neitz", + "ANTONIO ORVIETO", + "Luigi Gresele", + "Bernhard Sch\u00f6lkopf" + ], + "emails": [ + "~Bernhard_Sch\u00f6lkopf1", + "~Giambattista_Parascandolo1", + "~ANTONIO_ORVIETO2", + "~Alexander_Neitz1", + "~Luigi_Gresele1" + ], + "rank": 935 + }, + { + "url": "https://openreview.net/forum?id=p65lWYKpqKz", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 6, + 8 + ], + "rating": "5.93", + "confidences": [ + 3, + 2, + 4, + 3, + 3 + ], + "abstract": "Modeling the dynamics of real-world physical systems is critical for spatiotemporal prediction tasks, but challenging when data is limited. The scarcity of real-world data and the difficulty in reproducing the data distribution hinder directly applying meta-learning techniques. Although the knowledge of governing partial differential equations (PDE) of the data can be helpful for the fast adaptation to few observations, it is mostly infeasible to exactly find the equation for observations in real-world physical systems. In this work, we propose a framework, physics-aware meta-learning with auxiliary tasks whose spatial modules incorporate PDE-independent knowledge and temporal modules utilize the generalized features from the spatial modules to be adapted to the limited data, respectively. The framework is inspired by a local conservation law expressed mathematically as a continuity equation and does not require the exact form of governing equation to model the spatiotemporal observations. The proposed method mitigates the need for a large number of real-world tasks for meta-learning by leveraging spatial information in simulated data to meta-initialize the spatial modules. 
We apply the proposed framework to both synthetic and real-world spatiotemporal prediction tasks and demonstrate its superior performance with limited observations.", + "title": "Physics-aware Spatiotemporal Modules with Auxiliary Tasks for Meta-Learning", + "authors": [ + "Sungyong Seo", + "Chuizheng Meng", + "Sirisha Rambhatla", + "Yan Liu" + ], + "emails": [ + "~Yan_Liu1", + "~Sungyong_Seo1", + "~Chuizheng_Meng1", + "~Sirisha_Rambhatla1" + ], + "rank": 936 + }, + { + "url": "https://openreview.net/forum?id=hSjxQ3B7GWq", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 5, + 7, + 5 + ], + "rating": "5.93", + "confidences": [ + 4, + 3, + 5, + 3 + ], + "abstract": "Despite significant progress in challenging problems across various domains, applying state-of-the-art deep reinforcement learning (RL) algorithms remains challenging due to their sensitivity to the choice of hyperparameters. This sensitivity can partly be attributed to the non-stationarity of the RL problem, potentially requiring different hyperparameter settings at various stages of the learning process. Additionally, in the RL setting, hyperparameter optimization (HPO) requires a large number of environment interactions, hindering the transfer of the successes in RL to real-world applications. In this work, we tackle the issues of sample-efficient and dynamic HPO in RL. We propose a population-based automated RL (AutoRL) framework to meta-optimize arbitrary off-policy RL algorithms. In this framework, we optimize the hyperparameters and also the neural architecture while simultaneously training the agent. By sharing the collected experience across the population, we substantially increase the sample efficiency of the meta-optimization. 
We demonstrate the capabilities of our sample-efficient AutoRL approach in a case study with the popular TD3 algorithm in the MuJoCo benchmark suite, where we reduce the number of environment interactions needed for meta-optimization by up to an order of magnitude compared to population-based training.", + "title": "Sample-Efficient Automated Deep Reinforcement Learning", + "authors": [ + "J\u00f6rg K.H. Franke", + "Gregor Koehler", + "Andr\u00e9 Biedenkapp", + "Frank Hutter" + ], + "emails": [ + "~Frank_Hutter1", + "~J\u00f6rg_K.H._Franke1", + "~Andr\u00e9_Biedenkapp1", + "~Gregor_Koehler1" + ], + "rank": 937 + }, + { + "url": "https://openreview.net/forum?id=65MxtdJwEnl", + "decision": "Reject", + "ratings": [ + 5, + 7, + 6 + ], + "rating": "5.92", + "confidences": [ + 5, + 4, + 4 + ], + "abstract": "Neural Controlled Differential Equations (Neural CDEs) are the continuous-time analogue of an RNN, just as Neural ODEs are analogous to ResNets. However just like RNNs, training Neural CDEs can be difficult for long time series. Here, we propose to apply a technique drawn from stochastic analysis, namely the log-ODE method. Instead of using the original input sequence, our procedure summarises the information over local time intervals via the log-signature map, and uses the resulting shorter stream of log-signatures as the new input. This represents a length/channel trade-off. 
In doing so we demonstrate efficacy on problems of length up to 17k observations and observe significant training speed-ups, improvements in model performance, and reduced memory requirements compared to the existing algorithm.", + "title": "Neural CDEs for Long Time Series via the Log-ODE Method", + "authors": [ + "James Morrill", + "Patrick Kidger", + "Cristopher Salvi", + "James Foster", + "Terry Lyons" + ], + "emails": [ + "maths.ox.ac.uk", + "~Patrick_Kidger1" + ], + "rank": 938 + }, + { + "url": "https://openreview.net/forum?id=lvRTC669EY_", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 5, + 7, + 6 + ], + "rating": "5.92", + "confidences": [ + 3, + 3, + 2, + 4 + ], + "abstract": "We propose a simple, general and effective technique, Reward Randomization for discovering diverse strategic policies in complex multi-agent games. Combining reward randomization and policy gradient, we derive a new algorithm, Reward-Randomized Policy Gradient (RPG). RPG is able to discover a set of multiple distinctive human-interpretable strategies in challenging temporal trust dilemmas, including grid-world games and a real-world game Agar.io, where multiple equilibria exist but standard multi-agent policy gradient algorithms always converge to a fixed one with a sub-optimal payoff for every player even using state-of-the-art exploration techniques. Furthermore, with the set of diverse strategies from RPG, we can (1) achieve higher payoffs by fine-tuning the best policy from the set; and (2) obtain an adaptive agent by using this set of strategies as its training opponents. 
", + "title": "Discovering Diverse Multi-Agent Strategic Behavior via Reward Randomization", + "authors": [ + "Zhenggang Tang", + "Chao Yu", + "Boyuan Chen", + "Huazhe Xu", + "Xiaolong Wang", + "Fei Fang", + "Simon Shaolei Du", + "Yu Wang", + "Yi Wu" + ], + "emails": [ + "mails.tsinghua.edu.cn", + "pku.edu.cn", + "~Boyuan_Chen2", + "~Yu_Wang3", + "~Fei_Fang1", + "~Huazhe_Xu1", + "~Xiaolong_Wang3", + "~Simon_Shaolei_Du1", + "~Yi_Wu1" + ], + "rank": 939 + }, + { + "url": "https://openreview.net/forum?id=HmAhqnu3qu", + "decision": "Reject", + "ratings": [ + 6, + 5, + 7 + ], + "rating": "5.92", + "confidences": [ + 5, + 4, + 3 + ], + "abstract": "Graph Neural Networks (GNNs) have become the state-of-the-art method for many applications on graph structured data. GNNs are a framework for graph representation learning, where a model learns to generate low dimensional node embeddings that encapsulate structural and feature-related information. GNNs are usually trained in an end-to-end fashion, leading to highly specialized node embeddings. While this approach achieves great results in the single-task setting, generating node embeddings that can be used to perform multiple tasks (with performance comparable to single-task models) is an open problem. We propose a novel representation learning strategy, based on meta-learning, capable of producing multi-task node embeddings. Our method avoids the difficulties arising when learning to perform multiple tasks concurrently by, instead, learning to quickly (i.e. with a few steps of gradient descent) adapt to multiple tasks singularly. We show that the embeddings produced by our method can be used to perform multiple tasks with comparable or higher performance than both single-task and multi-task end-to-end models. 
Our method is model-agnostic and task-agnostic and can hence be applied to a wide variety of multi-task domains.", + "title": "Graph Representation Learning for Multi-Task Settings: a Meta-Learning Approach", + "authors": [ + "Davide Buffelli", + "Fabio Vandin" + ], + "emails": [ + "~Fabio_Vandin2", + "~Davide_Buffelli1" + ], + "rank": 940 + }, + { + "url": "https://openreview.net/forum?id=UfJn-cstSF", + "decision": "Reject", + "ratings": [ + 7, + 6, + 6, + 5 + ], + "rating": "5.92", + "confidences": [ + 3, + 5, + 1, + 4 + ], + "abstract": "The learned iterative shrinkage thresholding algorithm (LISTA) introduces deep unfolding models with learnable thresholds in the shrinkage function for sparse coding. Drawing on some theoretical insights, we advocate an error-based thresholding (EBT) mechanism for LISTA, which leverages a function of the layer-wise reconstruction error to suggest an appropriate threshold value for each observation on each layer. We show that the EBT mechanism well-disentangles the learnable parameters in the shrinkage functions from the reconstruction errors, making them more adaptive to the various observations. With rigorous theoretical analyses, we show that the proposed EBT can lead to faster convergence on the basis of LISTA and its variants, in addition to its higher adaptivity. Extensive experimental results confirm our theoretical analyses and verify the effectiveness of our methods.", + "title": "Learned ISTA with Error-based Thresholding for Adaptive Sparse Coding", + "authors": [ + "Li Ziang", + "Wu Kailun", + "Yiwen Guo", + "Changshui Zhang" + ], + "emails": [ + "~Yiwen_Guo1", + "~Li_Ziang1", + "~Wu_Kailun1", + "~Changshui_Zhang2" + ], + "rank": 941 + }, + { + "url": "https://openreview.net/forum?id=LIR3aVGIlln", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 8 + ], + "rating": "5.92", + "confidences": [ + 3, + 3, + 4, + 3 + ], + "abstract": "A point process describes how random sets of exchangeable points are generated. 
The points usually influence the positions of each other via attractive and repulsive forces. To model this behavior, it is enough to transform the samples from the uniform process with a sufficiently complex equivariant function. However, learning the parameters of the resulting process is challenging since the likelihood is hard to estimate and often intractable. This leads us to our proposed model - CONFET. Based on continuous normalizing flows, it allows arbitrary interactions between points while having tractable likelihood. Experiments on various real and synthetic datasets show the improved performance of our new scalable approach.", + "title": "Equivariant Normalizing Flows for Point Processes and Sets", + "authors": [ + "Marin Bilo\u0161", + "Stephan G\u00fcnnemann" + ], + "emails": [ + "~Stephan_G\u00fcnnemann1", + "~Marin_Bilo\u01611" + ], + "rank": 942 + }, + { + "url": "https://openreview.net/forum?id=9az9VKjOx00", + "decision": "Reject", + "ratings": [ + 6, + 6, + 7, + 5 + ], + "rating": "5.92", + "confidences": [ + 4, + 2, + 3, + 4 + ], + "abstract": "We present the Topology Transformation Equivariant Representation (TopoTER) learning, a general paradigm of unsupervised learning of node representations of graph data for the wide applicability to Graph Convolutional Neural Networks (GCNNs). We formalize the TopoTER from an information-theoretic perspective, by maximizing the mutual information between topology transformations and node representations before and after the transformations. We derive that maximizing such mutual information can be relaxed to minimizing the cross entropy between the applied topology transformation and its estimation from node representations. In particular, we seek to sample a subset of node pairs from the original graph and flip the edge connectivity between each pair to transform the graph topology. 
Then, we self-train a representation encoder to learn node representations by reconstructing the topology transformations from the feature representations of the original and transformed graphs. In experiments, we apply the TopoTER to the downstream node and graph classification tasks, and results show that the TopoTER outperforms the state-of-the-art unsupervised approaches.", + "title": "TopoTER: Unsupervised Learning of Topology Transformation Equivariant Representations", + "authors": [ + "Xiang Gao", + "Wei Hu", + "Guo-Jun Qi" + ], + "emails": [ + "~Guo-Jun_Qi1", + "~Wei_Hu6", + "~Xiang_Gao2" + ], + "rank": 943 + }, + { + "url": "https://openreview.net/forum?id=-kfLEqppEm_", + "decision": "Reject", + "ratings": [ + 4, + 8, + 5, + 5 + ], + "rating": "5.92", + "confidences": [ + 1, + 4, + 4, + 3 + ], + "abstract": "Monte-Carlo planning and Reinforcement Learning (RL) are essential to sequential decision making. The recent AlphaGo and AlphaZero algorithms have shown how to successfully combine these two paradigms to solve large scale sequential decision problems. These methodologies exploit a variant of the well-known UCT algorithm to trade off the exploitation of good actions and the exploration of unvisited states, but their empirical success comes at the cost of poor sample-efficiency and high computation time. In this paper, we overcome these limitations by studying the benefit of convex regularization in Monte-Carlo Tree Search (MCTS) to drive exploration efficiently and to improve policy updates, as already observed in RL. First, we introduce a unifying theory on the use of generic convex regularizers in MCTS, deriving the first regret analysis of regularized MCTS and showing that it guarantees an exponential convergence rate. Second, we exploit our theoretical framework to introduce novel regularized backup operators for MCTS, based on the relative entropy of the policy update and on the Tsallis entropy of the policy. 
We provide an intuitive demonstration of the effect of each regularizer empirically verifying the consequence of our theoretical results on a toy problem. Finally, we show how our framework can easily be incorporated in AlphaGo and AlphaZero, and we empirically show the superiority of convex regularization w.r.t. representative baselines, on well-known RL problems across several Atari games.", + "title": "Convex Regularization in Monte-Carlo Tree Search", + "authors": [ + "Tuan Quang Dam", + "Carlo D'Eramo", + "Jan Peters", + "Joni Pajarinen" + ], + "emails": [ + "~Carlo_D'Eramo2", + "~Tuan_Quang_Dam1", + "~Joni_Pajarinen2", + "~Jan_Peters3" + ], + "rank": 944 + }, + { + "url": "https://openreview.net/forum?id=IJxaSrLIbkx", + "decision": "Reject", + "ratings": [ + 8, + 5, + 6, + 5 + ], + "rating": "5.92", + "confidences": [ + 3, + 5, + 2, + 2 + ], + "abstract": "Explanations of Machine Learning (ML) models often address a \u2018Why?\u2019 question. Such explanations can be related with selecting feature-value pairs which are sufficient for the prediction. Recent work has investigated explanations that address\na \u2018Why Not?\u2019 question, i.e. finding a change of feature values that guarantee a change of prediction. Given their goals, these two forms of explaining predictions of ML models appear to be mostly unrelated. However, this paper demonstrates otherwise, and establishes a rigorous formal relationship between \u2018Why?\u2019 and \u2018Why Not?\u2019 explanations. Concretely, the paper proves that, for any given instance, \u2018Why?\u2019 explanations are minimal hitting sets of \u2018Why Not?\u2019 explanations and vice-versa. 
Furthermore, the paper devises novel algorithms for extracting and enumerating both forms of explanations.\n", + "title": "On Relating \"Why?\" and \"Why Not?\" Explanations", + "authors": [ + "Alexey Ignatiev", + "Nina Narodytska", + "Nicholas Asher", + "Joao Marques-Silva" + ], + "emails": [ + "~Nina_Narodytska1", + "irit.fr", + "~Joao_Marques-Silva1", + "~Alexey_Ignatiev1" + ], + "rank": 945 + }, + { + "url": "https://openreview.net/forum?id=17VnwXYZyhH", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 5, + 6 + ], + "rating": "5.92", + "confidences": [ + 3, + 3, + 4, + 3 + ], + "abstract": "Recently, a variety of probing tasks are proposed to discover linguistic properties learned in contextualized word embeddings. Many of these works implicitly assume these embeddings lay in certain metric spaces, typically the Euclidean space. This work considers a family of geometrically special spaces, the hyperbolic spaces, that exhibit better inductive biases for hierarchical structures and may better reveal linguistic hierarchies encoded in contextualized representations. We introduce a \u00e9Poincar\u00e9 probe, a structural probe projecting these embeddings into a Poincar\u00e9 subspace with explicitly defined hierarchies. We focus on two probing objectives: (a) dependency trees where the hierarchy is defined as head-dependent structures; (b) lexical sentiments where the hierarchy is defined as the polarity of words (positivity and negativity). We argue that a key desideratum of a probe is its sensitivity to the existence of linguistic structures. We apply our probes on BERT, a typical contextualized embedding model. In a syntactic subspace, our probe better recovers tree structures than Euclidean probes, revealing the possibility that the geometry of BERT syntax may not necessarily be Euclidean. 
In a sentiment subspace, we reveal two possible meta-embeddings for positive and negative sentiments and show how lexically-controlled contextualization would change the geometric localization of embeddings. We demonstrate the findings with our Poincar\u00e9 probe via extensive experiments and visualization. Our results can be reproduced at https://github.com/FranxYao/PoincareProbe", + "title": "Probing BERT in Hyperbolic Spaces", + "authors": [ + "Boli Chen", + "Yao Fu", + "Guangwei Xu", + "Pengjun Xie", + "Chuanqi Tan", + "Mosha Chen", + "Liping Jing" + ], + "emails": [ + "~Liping_Jing3", + "taobao.com", + "~Yao_Fu3", + "~Boli_Chen1", + "alibaba-inc.com" + ], + "rank": 946 + }, + { + "url": "https://openreview.net/forum?id=DHSNrGhAY7W", + "decision": "Reject", + "ratings": [ + 5, + 5, + 7, + 7 + ], + "rating": "5.92", + "confidences": [ + 4, + 3, + 2, + 4 + ], + "abstract": "Lipschitz constants of neural networks have been explored in various contexts in deep learning, such as provable adversarial robustness, estimating Wasserstein distance, stabilising training of GANs, and formulating invertible neural networks. Such works have focused on bounding the Lipschitz constant of fully connected or convolutional networks, composed of linear maps and pointwise non-linearities. In this paper, we investigate the Lipschitz constant of self-attention, a non-linear neural network module widely used in sequence modelling. We prove that the standard dot-product self-attention is *not* Lipschitz, and propose an alternative L2 self-attention that *is* Lipschitz. We derive an upper bound on the Lipschitz constant of L2 self-attention and provide empirical evidence for its asymptotic tightness. 
To demonstrate the practical relevance of our theoretical work, we formulate invertible self-attention and use it in a Transformer-based architecture for a character-level language modelling task.", + "title": "The Lipschitz Constant of Self-Attention", + "authors": [ + "Hyunjik Kim", + "George Papamakarios", + "Andriy Mnih" + ], + "emails": [ + "~George_Papamakarios1", + "~Andriy_Mnih1", + "~Hyunjik_Kim1" + ], + "rank": 947 + }, + { + "url": "https://openreview.net/forum?id=y_pDlU_FLS", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 8 + ], + "rating": "5.92", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Learned optimizers are algorithms that can themselves be trained to solve optimization problems. In contrast to baseline optimizers (such as momentum or Adam) that use simple update rules derived from intuitive principles, learned optimizers use flexible, high-dimensional, nonlinear parameterizations. Although this can lead to optimizers with better performance in certain settings, their inner workings remain a mystery. How is it that a learned optimizer is able to outperform a well tuned baseline? Has it learned a sophisticated method for combining existing optimization techniques, or is it implementing completely new behavior? In this work, we address these questions by visualizing and understanding learned optimizers. We study learned optimizers trained from scratch on three disparate tasks, and discovered that they have learned interpretable mechanisms, including: momentum, gradient clipping, schedules, and a new form of learning rate adaptation. Moreover, we show how the dynamics of trained learned optimizers enables these behaviors. 
Our results elucidate the previously murky understanding of what learned optimizers learn, and establishes tools for interpreting future learned optimizers.", + "title": "Reverse engineering learned optimizers reveals known and novel mechanisms", + "authors": [ + "Niru Maheswaranathan", + "David Sussillo", + "Luke Metz", + "Ruoxi Sun", + "Jascha Sohl-Dickstein" + ], + "emails": [ + "~Jascha_Sohl-Dickstein2", + "~Niru_Maheswaranathan1", + "~Luke_Metz1", + "~Ruoxi_Sun2", + "~David_Sussillo1" + ], + "rank": 948 + }, + { + "url": "https://openreview.net/forum?id=rI3RMgDkZqJ", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 7 + ], + "rating": "5.92", + "confidences": [ + 2, + 3, + 3, + 4 + ], + "abstract": "Safe reinforcement learning (SRL) problems are typically modeled as constrained Markov Decision Process (CMDP), in which an agent explores the environment to maximize the expected total reward and meanwhile avoids violating certain constraints on a number of expected total costs. In general, such SRL problems have nonconvex objective functions subject to multiple nonconvex constraints, and hence are very challenging to solve, particularly to provide a globally optimal policy. Many popular SRL algorithms adopt a primal-dual structure which utilizes the updating of dual variables for satisfying the constraints. In contrast, we propose a primal approach, called constraint-rectified policy optimization (CRPO), which updates the policy alternatingly between objective improvement and constraint satisfaction. CRPO provides a primal-type algorithmic framework to solve SRL problems, where each policy update can take any variant of policy optimization step. To demonstrate the theoretical performance of CRPO, we adopt natural policy gradient (NPG) for each policy update step and show that CRPO achieves an O(1/T) convergence rate to the global optimal policy in the constrained policy set and an O(1/T) error bound on constraint satisfaction. 
This is the first finite-time analysis of SRL algorithms with global optimality guarantee. Our empirical results demonstrate that CRPO can outperform the existing primal-dual baseline algorithms significantly.", + "title": "A Primal Approach to Constrained Policy Optimization: Global Optimality and Finite-Time Analysis", + "authors": [ + "Tengyu Xu", + "Yingbin Liang", + "Guanghui Lan" + ], + "emails": [ + "~Yingbin_Liang1", + "~Tengyu_Xu1", + "~Guanghui_Lan1" + ], + "rank": 949 + }, + { + "url": "https://openreview.net/forum?id=refmbBH_ysO", + "decision": "Reject", + "ratings": [ + 3, + 7, + 7 + ], + "rating": "5.91", + "confidences": [ + 3, + 3, + 5 + ], + "abstract": "Spreadsheet formula prediction has been an important program synthesis problem with many real-world applications. Previous works typically utilize input-output examples as the specification for spreadsheet formula synthesis, where each input-output pair simulates a separate row in the spreadsheet. However, such a formulation does not fully capture the rich context in real-world spreadsheets. First, spreadsheet data entries are organized as tables, thus rows and columns are not necessarily independent from each other. In addition, many spreadsheet tables include headers, which provide high-level descriptions of the cell data. However, previous synthesis approaches do not consider headers as part of the specification. In this work, we present the first approach for synthesizing spreadsheet formulas from tabular context, which includes both headers and semi-structured tabular data. In particular, we propose SpreadsheetCoder, a BERT-based model architecture to represent the tabular context in both row-based and column-based formats. 
We train our model on a large dataset of spreadsheets, and demonstrate that SpreadsheetCoder achieves top-1 prediction accuracy of 42.51%, which is a considerable improvement over baselines that do not employ rich tabular context.", + "title": "SpreadsheetCoder: Formula Prediction from Semi-structured Context", + "authors": [ + "Xinyun Chen", + "Petros Maniatis", + "Rishabh Singh", + "Charles Sutton", + "Hanjun Dai", + "Max Lin", + "Denny Zhou" + ], + "emails": [ + "~Charles_Sutton1", + "~Rishabh_Singh1", + "~Petros_Maniatis1", + "google.com", + "~Denny_Zhou1", + "~Hanjun_Dai1", + "~Xinyun_Chen1" + ], + "rank": 950 + }, + { + "url": "https://openreview.net/forum?id=taQNxF9Sj6", + "decision": "Reject", + "ratings": [ + 7, + 7, + 4 + ], + "rating": "5.91", + "confidences": [ + 2, + 5, + 4 + ], + "abstract": "Fine-tuning a pretrained transformer for a downstream task has become a standard method in NLP in the last few years. While the results from these models are impressive, applying them can be extremely computationally expensive, as is pretraining new models with the latest architectures. We present a novel method for applying pretrained transformer language models which lowers their memory requirement both at training and inference time. An additional benefit is that our method removes the fixed context size constraint that most transformer models have, allowing for more flexible use. 
When applied to the GPT-2 language model, we find that our method attains better perplexity than an unmodified GPT-2 model on the PG-19 and WikiText-103 corpora, for a given amount of computation or memory.", + "title": " Adding Recurrence to Pretrained Transformers", + "authors": [ + "Davis Yoshida", + "Allyson Ettinger", + "Kevin Gimpel" + ], + "emails": [ + "~Allyson_Ettinger1", + "~Kevin_Gimpel1", + "~Davis_Yoshida1" + ], + "rank": 951 + }, + { + "url": "https://openreview.net/forum?id=n1wPkibo2R", + "decision": "Reject", + "ratings": [ + 5, + 6, + 7 + ], + "rating": "5.91", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "We give a distributed protocol with nearly-optimal communication and number of rounds for Column Subset Selection with respect to the entrywise {\u21131} norm (k-CSS1), and more generally, for the \u2113p-norm with 1\u2264p<2. We study matrix factorization in \u21131-norm loss, rather than the more standard Frobenius norm loss, because the \u21131 norm is more robust to noise, which is observed to lead to improved performance in a wide range of computer vision and robotics problems.\nIn the distributed setting, we consider s servers in the standard coordinator model of communication, where the columns of the input matrix A\u2208Rd\u00d7n (n\u226bd) are distributed across the s servers. We give a protocol in this model with O~(sdk) communication, 1 round, and polynomial running time, and which achieves a multiplicative k1p\u221212\\poly(log\u2061nd)-approximation to the best possible column subset. A key ingredient in our proof is the reduction to the \u2113p,2-norm, which corresponds to the p-norm of the vector of Euclidean norms of each of the columns of A. This enables us to use strong coreset constructions for Euclidean norms, which previously had not been used in this context. This naturally also allows us to implement our algorithm in the popular streaming model of computation. 
We further propose a greedy algorithm for selecting columns, which can be used by the coordinator, and show the first provable guarantees for a greedy algorithm for the \u21131,2 norm. Finally, we implement our protocol and give significant practical advantages on real-world data analysis tasks.", + "title": "An Efficient Protocol for Distributed Column Subset Selection in the Entrywise \u2113p Norm", + "authors": [ + "Shuli Jiang", + "Dongyu Li", + "Irene Mengze Li", + "Arvind V. Mahankali", + "David Woodruff" + ], + "emails": [ + "andrew.cmu.edu", + "cs.cmu.edu", + "~Shuli_Jiang1", + "~David_Woodruff2", + "~Arvind_V._Mahankali1" + ], + "rank": 952 + }, + { + "url": "https://openreview.net/forum?id=ZUW6N_SVKxq", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6, + 7 + ], + "rating": "5.91", + "confidences": [ + 3, + 3, + 3, + 2 + ], + "abstract": "We define deep kernel processes in which positive definite Gram matrices are progressively transformed by nonlinear kernel functions and by sampling from (inverse) Wishart distributions. Remarkably, we find that deep Gaussian processes (DGPs), Bayesian neural networks (BNNs), infinite BNNs, and infinite BNNs with bottlenecks can all be written as deep kernel processes. For DGPs the equivalence arises because the Gram matrix formed by the inner product of features is Wishart distributed, and as we show, standard isotropic kernels can be written entirely in terms of this Gram matrix (we do not need knowledge of the underlying features). We define a tractable deep kernel process, the deep inverse Wishart process and give a doubly-stochastic inducing-point variational inference scheme that operates on the Gram matrices, not on the features (as in DGPs). We show that the deep inverse Wishart process gives superior performance to DGPs and infinite BNN on standard fully-connected baselines.", + "title": "Deep Kernel Processes", + "authors": [ + "Laurence Aitchison", + "Adam X. Yang", + "Sebastian W. 
Ober" + ], + "emails": [ + "~Laurence_Aitchison1", + "~Adam_X._Yang1", + "~Sebastian_W._Ober1" + ], + "rank": 953 + }, + { + "url": "https://openreview.net/forum?id=4HGL3H9eL9U", + "decision": "Reject", + "ratings": [ + 6, + 7, + 5 + ], + "rating": "5.91", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "With the rapid development of adversarial machine learning, numerous adversarial attack methods have been proposed. Typical attacks are based on a search in the neighborhood of input image to generate a perturbed adversarial example. Since 2017, generative models are adopted for adversarial attacks, and most of them focus on generating adversarial perturbations from input noise or input image. Thus the output is restricted by input for these works. A recent work targets unrestricted adversarial example using generative model but their method is based on a search in the neighborhood of input noise, so actually their output is still constrained by input. In this work, we propose AT-GAN (Adversarial Transfer on Generative Adversarial Net) to train an adversarial generative model that can directly produce adversarial examples. Different from previous works, we aim to learn the distribution of adversarial examples so as to generate semantically meaningful adversaries. AT-GAN achieves this goal by first learning a generative model for real data, followed by transfer learning to obtain the desired generative model. Once trained and transferred, AT-GAN could generate adversarial examples directly and quickly for any input noise, denoted as non-constrained adversarial examples. 
Extensive experiments and visualizations show that AT-GAN can efficiently generate diverse adversarial examples that are realistic to human perception, and yields higher attack success rates against adversarially trained models.\n\n ", + "title": "AT-GAN: An Adversarial Generative Model for Non-constrained Adversarial Examples", + "authors": [ + "Xiaosen Wang", + "Kun He", + "Chuanbiao Song", + "Liwei Wang", + "John E. Hopcroft" + ], + "emails": [ + "~Chuanbiao_Song2", + "~Kun_He1", + "~Xiaosen_Wang1", + "~John_E._Hopcroft1", + "~Liwei_Wang1" + ], + "rank": 954 + }, + { + "url": "https://openreview.net/forum?id=HC5VgCHtU10", + "decision": "Reject", + "ratings": [ + 7, + 5, + 5, + 7 + ], + "rating": "5.91", + "confidences": [ + 2, + 4, + 2, + 3 + ], + "abstract": "Keystroke inference attacks are a form of side-channels attacks in which an attacker leverages various techniques to recover a user\u2019s keystrokes as she inputs information into some display (for example, while sending a text message or entering her pin). Typically, these attacks leverage machine learning approaches, but assessing the realism of the threat space has lagged behind the pace of machine learning advancements, due in-part, to the challenges in curating large real-life datasets. This paper aims to overcome the challenge of having limited number of real data by introducing a video domain adaptation technique that is able to leverage synthetic data through supervised disentangled learning. Specifically, for a given domain, we decompose the observed data into two factors of variation: Style and Content. Doing so provides four learned representations: real-life style, synthetic style, real-life content and synthetic content. Then, we combine them into feature representations from all combinations of style-content pairings across domains, and train a model on these combined representations to classify the content (i.e., labels) of a given datapoint in the style of another domain. 
We evaluate our method on real-life data using a variety of metrics to quantify the amount of information an attacker is able to recover. We show that our method prevents our model from overfitting to a small real-life training set, indicating that our method is an effective form of data augmentation. Code and data will be released after reviewal. ", + "title": "Disentangling style and content for low resource video domain adaptation: a case study on keystroke inference attacks", + "authors": [ + "John Lim", + "Fabian Monrose", + "Jan-Michael Frahm" + ], + "emails": [ + "~Fabian_Monrose1", + "~John_Lim1", + "~Jan-Michael_Frahm1" + ], + "rank": 955 + }, + { + "url": "https://openreview.net/forum?id=6t_dLShIUyZ", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 8, + 6, + 3, + 8 + ], + "rating": "5.90", + "confidences": [ + 3, + 3, + 4, + 5, + 5 + ], + "abstract": "Greedy-GQ is a value-based reinforcement learning (RL) algorithm for optimal control. Recently, the finite-time analysis of Greedy-GQ has been developed under linear function approximation and Markovian sampling, and the algorithm is shown to achieve an \u03f5-stationary point with a sample complexity in the order of O(\u03f5\u22123). Such a high sample complexity is due to the large variance induced by the Markovian samples. In this paper, we propose a variance-reduced Greedy-GQ (VR-Greedy-GQ) algorithm for off-policy optimal control. In particular, the algorithm applies the SVRG-based variance reduction scheme to reduce the stochastic variance of the two time-scale updates. We study the finite-time convergence of VR-Greedy-GQ under linear function approximation and Markovian sampling and show that the algorithm achieves a much smaller bias and variance error than the original Greedy-GQ. In particular, we prove that VR-Greedy-GQ achieves an improved sample complexity that is in the order of O(\u03f5\u22122). 
We further compare the performance of VR-Greedy-GQ with that of Greedy-GQ in various RL experiments to corroborate our theoretical findings.", + "title": "Greedy-GQ with Variance Reduction: Finite-time Analysis and Improved Complexity", + "authors": [ + "Shaocong Ma", + "Ziyi Chen", + "Yi Zhou", + "Shaofeng Zou" + ], + "emails": [ + "~Ziyi_Chen2", + "~Shaocong_Ma1", + "~Yi_Zhou2", + "~Shaofeng_Zou1" + ], + "rank": 956 + }, + { + "url": "https://openreview.net/forum?id=WC04PD6dFrP", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 8 + ], + "rating": "5.90", + "confidences": [ + 3, + 3, + 3, + 1 + ], + "abstract": "We consider off-policy evaluation (OPE) in continuous action domains, such as dynamic pricing and personalized dose finding. In OPE, one aims to learn the value under a new policy using historical data generated by a different behavior policy. Most existing works on OPE focus on discrete action domains. To handle continuous action space, we develop a brand-new deep jump Q-evaluation method for OPE. The key ingredient of our method lies in adaptively discretizing the action space using deep jump Q-learning. This allows us to apply existing OPE methods in discrete domains to handle continuous actions. Our method is further justified by theoretical results, synthetic and real datasets.\n", + "title": "Deep Jump Q-Evaluation for Offline Policy Evaluation in Continuous Action Space", + "authors": [ + "Hengrui Cai", + "Chengchun Shi", + "Rui Song", + "Wenbin Lu" + ], + "emails": [ + "~Rui_Song2", + "ncsu.edu", + "lse.ac.uk", + "~Hengrui_Cai1" + ], + "rank": 957 + }, + { + "url": "https://openreview.net/forum?id=d8Q1mt2Ghw", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 5, + 5, + 7 + ], + "rating": "5.90", + "confidences": [ + 3, + 2, + 2, + 3 + ], + "abstract": "For autonomous vehicles to safely share the road with human drivers, autonomous vehicles must abide by specific \"road rules\" that human drivers have agreed to follow. 
\"Road rules\" include rules that drivers are required to follow by law \u2013 such as the requirement that vehicles stop at red lights \u2013 as well as more subtle social rules \u2013 such as the implicit designation of fast lanes on the highway. In this paper, we provide empirical evidence that suggests that \u2013 instead of hard-coding road rules into self-driving algorithms \u2013 a scalable alternative may be to design multi-agent environments in which road rules emerge as optimal solutions to the problem of maximizing traffic flow. We analyze what ingredients in driving environments cause the emergence of these road rules and find that two crucial factors are noisy perception and agents\u2019 spatial density. We provide qualitative and quantitative evidence of the emergence of seven social driving behaviors, ranging from obeying traffic signals to following lanes, all of which emerge from training agents to drive quickly to destinations without colliding. Our results add empirical support for the social road rules that countries worldwide have agreed on for safe, efficient driving.", + "title": "Emergent Road Rules In Multi-Agent Driving Environments", + "authors": [ + "Avik Pal", + "Jonah Philion", + "Yuan-Hong Liao", + "Sanja Fidler" + ], + "emails": [ + "~Jonah_Philion1", + "~Avik_Pal1", + "~Sanja_Fidler1", + "~Yuan-Hong_Liao2" + ], + "rank": 958 + }, + { + "url": "https://openreview.net/forum?id=nVZtXBI6LNn", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 5, + 5 + ], + "rating": "5.89", + "confidences": [ + 4, + 2, + 1, + 2 + ], + "abstract": "Formal verification of neural networks (NNs) is a challenging and important problem. Existing efficient complete solvers typically require the branch-and-bound (BaB) process, which splits the problem domain into sub-domains and solves each sub-domain using faster but weaker incomplete verifiers, such as Linear Programming (LP) on linearly relaxed sub-domains. 
In this paper, we propose to use the backward mode linear relaxation based perturbation analysis (LiRPA) to replace LP during the BaB process, which can be efficiently implemented on the typical machine learning accelerators such as GPUs and TPUs. However, unlike LP, LiRPA when applied naively can produce much weaker bounds and even cannot check certain conflicts of sub-domains during splitting, making the entire procedure incomplete after BaB. To address these challenges, we apply a fast gradient based bound tightening procedure combined with batch splits and the design of minimal usage of LP bound procedure, enabling us to effectively use LiRPA on the accelerator hardware for the challenging complete NN verification problem and significantly outperform LP-based approaches. On a single GPU, we demonstrate an order of magnitude speedup compared to existing LP-based approaches.", + "title": "Fast and Complete: Enabling Complete Neural Network Verification with Rapid and Massively Parallel Incomplete Verifiers", + "authors": [ + "Kaidi Xu", + "Huan Zhang", + "Shiqi Wang", + "Yihan Wang", + "Suman Jana", + "Xue Lin", + "Cho-Jui Hsieh" + ], + "emails": [ + "~Xue_Lin1", + "~Cho-Jui_Hsieh1", + "~Yihan_Wang2", + "~Shiqi_Wang2", + "~Suman_Jana1", + "~Kaidi_Xu1", + "~Huan_Zhang1" + ], + "rank": 959 + }, + { + "url": "https://openreview.net/forum?id=J40FkbdldTX", + "decision": "Reject", + "ratings": [ + 5, + 5, + 8, + 5 + ], + "rating": "5.88", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Recently presented benchmarks for Neural Architecture Search (NAS) provide the results of training thousands of different architectures in a specific search space, thus enabling the fair and rapid comparison of different methods.\nBased on these results, we quantify the ranking correlations of single-path architecture search methods\nin different search space subsets and under several training variations;\nstudying their impact on the expected search results.\nThe experiments 
support the few-shot approach and Linear Transformers,\nprovide evidence against disabling cell topology sharing during the training phase or using strong regularization in the NAS-Bench-201 search space,\nand show the necessity of further research regarding super-network size and path sampling strategies.", + "title": "Exploring single-path Architecture Search ranking correlations", + "authors": [ + "Kevin Alexander Laube", + "Andreas Zell" + ], + "emails": [ + "~Andreas_Zell1", + "~Kevin_Alexander_Laube1" + ], + "rank": 960 + }, + { + "url": "https://openreview.net/forum?id=cB_mXKTs9J", + "decision": "Reject", + "ratings": [ + 7, + 6, + 4, + 7 + ], + "rating": "5.88", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Deep neural networks (DNNs) have been widely adopted in different applications to achieve state-of-the-art performance. However, they are often applied as a black box with limited understanding of what the model has learned from the data. In this paper, we focus on image classification and propose a method to visualize and understand the class-wise patterns learned by DNNs trained under three different settings including natural, backdoored and adversarial. Different from existing class-wise deep representation visualizations, our method searches for a single predictive pattern in the input (i.e. pixel) space for each class. Based on the proposed method, we show that DNNs trained on natural (clean) data learn abstract shapes along with some texture, and backdoored models learn a small but highly predictive pattern for the backdoor target class. Interestingly, the existence of class-wise predictive patterns in the input space indicates that even DNNs trained on clean data can have backdoors, and the class-wise patterns identified by our method can be readily applied to \u201cbackdoor\u201d attack the model. In the adversarial setting, we show that adversarially trained models learn more simplified shape patterns. 
Our method can serve as a useful tool to better understand DNNs trained on different datasets under different settings.", + "title": "What Do Deep Nets Learn? Class-wise Patterns Revealed in the Input Space", + "authors": [ + "Shihao Zhao", + "Xingjun Ma", + "Yisen Wang", + "James Bailey", + "Bo Li", + "Yu-Gang Jiang" + ], + "emails": [ + "~James_Bailey1", + "~Bo_Li19", + "~Xingjun_Ma1", + "~Yu-Gang_Jiang1", + "~Yisen_Wang1", + "~Shihao_Zhao1" + ], + "rank": 961 + }, + { + "url": "https://openreview.net/forum?id=0Jr4rjA6glk", + "ratings": [ + 7, + 4, + 7 + ], + "rating": "5.88", + "confidences": [ + 3, + 3, + 2 + ], + "abstract": "Differentially Private-SGD (DP-SGD) of Abadi et al. (2016) and its variations are the only known\nalgorithms for private training of large scale neural networks. This algorithm requires computation\nof per-sample gradients norms which is extremely slow and memory intensive in practice. In this\npaper, we present a new framework to design differentially private optimizers called DP-SGD-JL and\nDP-Adam-JL. Our approach uses Johnson\u2013Lindenstrauss (JL) projections to quickly approximate\nthe per-sample gradient norms without exactly computing them, thus making the training time and\nmemory requirements of our optimizers closer to that of their non-DP versions.\nOur algorithms achieve state-of-the-art privacy-vs-accuracy tradeoffs on MNIST and CIFAR10\ndatasets while being significantly faster. Unlike previous attempts to make DP-SGD faster which\nwork only on fully-connected or convolutional layers, our algorithms work for any network in a\nblack-box manner which is the main contribution of this paper. To illustrate this, on IMDb\ndataset, we train a Recurrent Neural Network (RNN) to achieve good privacy-vs-accuracy tradeoff,\nwhereas existing DP optimizers are either inefficient or inapplicable. 
On RNNs, our algorithms are\norders of magnitude faster than DP-SGD for large batch sizes.\nThe privacy analysis of our algorithms is more involved than DP-SGD, we use the recently proposed\nf-DP framework of Dong et al. (2019). In summary, we design new differentially private training\nalgorithms which are fast, achieve state-of-the-art privacy-vs-accuracy tradeoffs and generalize to all\nnetwork architectures.", + "title": "FAST DIFFERENTIALLY PRIVATE-SGD VIA JL PROJECTIONS", + "authors": [ + "Zhiqi Bu", + "Sivakanth Gopi", + "Janardhan Kulkarni", + "Yin Tat Lee", + "Uthaipon Tantipongpipat" + ], + "emails": [ + "~Janardhan_Kulkarni2", + "microsoft.com", + "~Zhiqi_Bu1", + "~Yin_Tat_Lee1", + "~Uthaipon_Tantipongpipat1" + ], + "rank": 962 + }, + { + "url": "https://openreview.net/forum?id=xpFFI_NtgpW", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 6, + 4 + ], + "rating": "5.88", + "confidences": [ + 5, + 3, + 3, + 5 + ], + "abstract": "We re-evaluate the standard practice of sharing weights between input and output embeddings in state-of-the-art pre-trained language models. We show that decoupled embeddings provide increased modeling flexibility, allowing us to significantly improve the efficiency of parameter allocation in the input embedding of multilingual models. By reallocating the input embedding parameters in the Transformer layers, we achieve dramatically better performance on standard natural language understanding tasks with the same number of parameters during fine-tuning. We also show that allocating additional capacity to the output embedding provides benefits to the model that persist through the fine-tuning stage even though the output embedding is discarded after pre-training. Our analysis shows that larger output embeddings prevent the model's last layers from overspecializing to the pre-training task and encourage Transformer representations to be more general and more transferable to other tasks and languages. 
Harnessing these findings, we are able to train models that achieve strong performance on the XTREME benchmark without increasing the number of parameters at the fine-tuning stage. ", + "title": "Rethinking Embedding Coupling in Pre-trained Language Models", + "authors": [ + "Hyung Won Chung", + "Thibault Fevry", + "Henry Tsai", + "Melvin Johnson", + "Sebastian Ruder" + ], + "emails": [ + "~Melvin_Johnson1", + "~Sebastian_Ruder2", + "~Thibault_Fevry1", + "~Hyung_Won_Chung1", + "google.com" + ], + "rank": 963 + }, + { + "url": "https://openreview.net/forum?id=Gu5WqN9J3Fn", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 6, + 7, + 7 + ], + "rating": "5.88", + "confidences": [ + 5, + 4, + 3, + 5 + ], + "abstract": "Choosing the right representation for geometry is crucial for making 3D models compatible with existing applications. Focusing on piecewise-smooth man-made shapes, we propose a new representation that is usable in conventional CAD modeling pipelines and can also be learned by deep neural networks. We demonstrate its benefits by applying it to the task of sketch-based modeling. Given a raster image, our system infers a set of parametric surfaces that realize the input in 3D. To capture piecewise smooth geometry, we learn a special shape representation: a deformable parametric template composed of Coons patches. Naively training such a system, however, is hampered by non-manifold artifacts in the parametric shapes and by a lack of data. To address this, we introduce loss functions that bias the network to output non-self-intersecting shapes and implement them as part of a fully self-supervised system, automatically generating both shape templates and synthetic training data. 
We develop a testbed for sketch-based modeling, demonstrate shape interpolation, and provide comparison to related work.", + "title": "Learning Manifold Patch-Based Representations of Man-Made Shapes", + "authors": [ + "Dmitriy Smirnov", + "Mikhail Bessmeltsev", + "Justin Solomon" + ], + "emails": [ + "~Justin_Solomon1", + "~Mikhail_Bessmeltsev1", + "~Dmitriy_Smirnov1" + ], + "rank": 964 + }, + { + "url": "https://openreview.net/forum?id=U4XLJhqwNF1", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 5, + 7, + 6 + ], + "rating": "5.88", + "confidences": [ + 4, + 5, + 3, + 5 + ], + "abstract": "Contrastive learning has recently been a core for unsupervised visual representation learning. Without human annotation, the common practice is to perform an instance discrimination task: Given a query image crop, label crops from the same image as positives, and crops from other randomly sampled images as negatives. An important limitation of this label assignment is that it can not reflect the heterogeneous similarity of the query crop to crops from other images, but regarding them as equally negative. To address this issue, inspired by consistency regularization in semi-supervised learning, we propose Consistent Contrast (CO2), which introduces a consistency term into unsupervised contrastive learning framework. The consistency term takes the similarity of the query crop to crops from other images as unlabeled, and the corresponding similarity of a positive crop as a pseudo label. It then encourages consistency between these two similarities. Empirically, CO2 improves Momentum Contrast (MoCo) by 2.9% top-1 accuracy on ImageNet linear protocol, 3.8% and 1.1% top-5 accuracy on 1% and 10% labeled semi-supervised settings. It also transfers to image classification, object detection, and semantic segmentation on PASCAL VOC. 
This shows that CO2 learns better visual representations for downstream tasks.", + "title": "CO2: Consistent Contrast for Unsupervised Visual Representation Learning", + "authors": [ + "Chen Wei", + "Huiyu Wang", + "Wei Shen", + "Alan Yuille" + ], + "emails": [ + "~Chen_Wei2", + "~Alan_Yuille1", + "~Huiyu_Wang1", + "~Wei_Shen2" + ], + "rank": 965 + }, + { + "url": "https://openreview.net/forum?id=EVV259WQuFG", + "decision": "Reject", + "ratings": [ + 7, + 5, + 5, + 6 + ], + "rating": "5.88", + "confidences": [ + 5, + 4, + 3, + 5 + ], + "abstract": "We propose two linguistic verifiers for span-extraction style machine reading comprehension to respectively tackle two challenges: how to evaluate the syntactic completeness of predicted answers and how to utilize the rich context of long documents. Our first verifier rewrites a question through replacing its interrogatives by the predicted answer phrases and then builds a cross-attention scorer between the rewritten question and the segment, so that the answer candidates are scored in a \\emph{position-sensitive} context. Our second verifier builds a hierarchical attention network to represent segments in a passage where neighbour segments in long passages are \\emph{recurrently connected} and can contribute to current segment-question pair's inference for answerablility classification and boundary determination. We then combine these two verifiers together into a pipeline and apply it to SQuAD2.0, NewsQA and TriviaQA benchmark sets. 
Our pipeline achieves significantly better improvements of both exact matching and F1 scores than state-of-the-art baselines.", + "title": "Machine Reading Comprehension with Enhanced Linguistic Verifiers", + "authors": [ + "Xianchao Wu" + ], + "emails": [ + "~Xianchao_Wu1" + ], + "rank": 966 + }, + { + "url": "https://openreview.net/forum?id=cu7IUiOhujH", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 5, + 7, + 6 + ], + "rating": "5.88", + "confidences": [ + 4, + 5, + 3, + 4 + ], + "abstract": "State-of-the-art natural language understanding classification models follow two-stages: pre-training a large language model on an auxiliary task, and then fine-tuning the model on a task-specific labeled dataset using cross-entropy loss. However, the cross-entropy loss has several shortcomings that can lead to sub-optimal generalization and instability. Driven by the intuition that good generalization requires capturing the similarity between examples in one class and contrasting them with examples in other classes, we propose a supervised contrastive learning (SCL) objective for the fine-tuning stage. Combined with cross-entropy, our proposed SCL loss obtains significant improvements over a strong RoBERTa-Large baseline on multiple datasets of the GLUE benchmark in few-shot learning settings, without requiring specialized architecture, data augmentations, memory banks, or additional unsupervised data. 
Our proposed fine-tuning objective leads to models that are more robust to different levels of noise in the fine-tuning training data, and can generalize better to related tasks with limited labeled data.", + "title": "Supervised Contrastive Learning for Pre-trained Language Model Fine-tuning", + "authors": [ + "Beliz Gunel", + "Jingfei Du", + "Alexis Conneau", + "Veselin Stoyanov" + ], + "emails": [ + "~Veselin_Stoyanov2", + "~Jingfei_Du1", + "~Alexis_Conneau1", + "~Beliz_Gunel1" + ], + "rank": 967 + }, + { + "url": "https://openreview.net/forum?id=tJz_QUXB7C", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6 + ], + "rating": "5.88", + "confidences": [ + 4, + 1, + 3 + ], + "abstract": "We propose FOSAE++, an unsupervised end-to-end neural system that generates a compact discrete state transition model (dynamics / action model) from raw visual observations. Our representation can be exported to Planning Domain Description Language (PDDL), allowing symbolic state-of-the-art classical planners to perform high-level task planning on raw observations. FOSAE++ expresses states and actions in First Order Logic (FOL), a superset of so-called object-centric representation. 
It is the first unsupervised neural system that fully supports FOL in PDDL action modeling, while existing systems are limited to continuous, propositional, or property-based representations, and/or require manually labeled input for actions/predicates/propositions.", + "title": "Generating Plannable Lifted Action Models for Visually Generated Logical Predicates", + "authors": [ + "Masataro Asai" + ], + "emails": [ + "~Masataro_Asai1" + ], + "rank": 968 + }, + { + "url": "https://openreview.net/forum?id=n1HD8M6WGn", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 5, + 5 + ], + "rating": "5.88", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "Encoder layer fusion (EncoderFusion) is a technique to fuse all the encoder layers (instead of the uppermost layer) for sequence-to-sequence (Seq2Seq) models, which has proven effective on various NLP tasks. However, it is still not entirely clear why and when EncoderFusion should work. In this paper, our main contribution is to take a step further in understanding EncoderFusion. Many of previous studies believe that the success of EncoderFusion comes from exploiting surface and syntactic information embedded in lower encoder layers. Unlike them, we find that the encoder embedding layer is more important than other intermediate encoder layers. In addition, the uppermost decoder layer consistently pays more attention to the encoder embedding layer across NLP tasks. Based on this observation, we propose a simple fusion method, SurfaceFusion, by fusing only the encoder embedding layer for the softmax layer. Experimental results show that SurfaceFusion outperforms EncoderFusion on several NLP benchmarks, including machine translation, text summarization, and grammatical error correction. It obtains the state-of-the-art performance on WMT16 Romanian-English and WMT14 English-French translation tasks. 
Extensive analyses reveal that SurfaceFusion learns more expressive bilingual word embeddings by building a closer relationship between relevant source and target embeddings. Source code is freely available at https://github.com/SunbowLiu/SurfaceFusion.", + "title": "Understanding and Improving Encoder Layer Fusion in Sequence-to-Sequence Learning", + "authors": [ + "Xuebo Liu", + "Longyue Wang", + "Derek F. Wong", + "Liang Ding", + "Lidia S. Chao", + "Zhaopeng Tu" + ], + "emails": [ + "~Liang_Ding3", + "~Longyue_Wang3", + "~Xuebo_Liu1", + "~Derek_F._Wong1", + "um.edu.mo", + "~Zhaopeng_Tu1" + ], + "rank": 969 + }, + { + "url": "https://openreview.net/forum?id=c77KhoLYSwF", + "decision": "Reject", + "ratings": [ + 4, + 5, + 7, + 8 + ], + "rating": "5.88", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Data poisoning and backdoor attacks manipulate training data in order to cause models to fail during inference. A recent survey of industry practitioners found that data poisoning is the number one concern among threats ranging from model stealing to adversarial attacks. However, we find that the impressive performance evaluations from data poisoning attacks are, in large part, artifacts of inconsistent experimental design. Moreover, we find that existing poisoning methods have been tested in contrived scenarios, and many fail in more realistic settings. In order to promote fair comparison in future work, we develop standardized benchmarks for data poisoning and backdoor attacks.", + "title": "Just How Toxic is Data Poisoning? 
A Benchmark for Backdoor and Data Poisoning Attacks", + "authors": [ + "Avi Schwarzschild", + "Micah Goldblum", + "Arjun Gupta", + "John P Dickerson", + "Tom Goldstein" + ], + "emails": [ + "~John_P_Dickerson1", + "~Avi_Schwarzschild1", + "~Arjun_Gupta2", + "~Micah_Goldblum1", + "~Tom_Goldstein1" + ], + "rank": 970 + }, + { + "url": "https://openreview.net/forum?id=Cnon5ezMHtu", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 6, + 8, + 6 + ], + "rating": "5.88", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Neural Architecture Search (NAS) has been explosively studied to automate the discovery of top-performer neural networks. Current works require heavy training of supernet or intensive architecture evaluations, thus suffering from heavy resource consumption and often incurring search bias due to truncated training or approximations. Can we select the best neural architectures without involving any training and eliminate a drastic portion of the search cost? \n\nWe provide an affirmative answer, by proposing a novel framework called \\textit{training-free neural architecture search} (TE-NAS). TE-NAS ranks architectures by analyzing the spectrum of the neural tangent kernel (NTK), and the number of linear regions in the input space. Both are motivated by recent theory advances in deep networks, and can be computed without any training. We show that: (1) these two measurements imply the trainability and expressivity of a neural network; and (2) they strongly correlate with the network's actual test accuracy. Further on, we design a pruning-based NAS mechanism to achieve a more flexible and superior trade-off between the trainability and expressivity during the search. In NAS-Bench-201 and DARTS search spaces, TE-NAS completes high-quality search but only costs 0.5 and 4 GPU hours with one 1080Ti on CIFAR-10 and ImageNet, respectively. 
We hope our work to inspire more attempts in bridging between the theoretic findings of deep networks and practical impacts in real NAS applications.", + "title": "Neural Architecture Search on ImageNet in Four GPU Hours: A Theoretically Inspired Perspective", + "authors": [ + "Wuyang Chen", + "Xinyu Gong", + "Zhangyang Wang" + ], + "emails": [ + "~Zhangyang_Wang1", + "~Xinyu_Gong1", + "~Wuyang_Chen1" + ], + "rank": 971 + }, + { + "url": "https://openreview.net/forum?id=cTbIjyrUVwJ", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 6, + 6 + ], + "rating": "5.88", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "In recent deep image compression neural networks, the entropy model plays a critical role in estimating the prior distribution of deep image encodings. Existing methods combine hyperprior with local context in the entropy estimation function. This greatly limits their performance due to the absence of a global vision. In this work, we propose a novel Global Reference Model for image compression to effectively leverage both the local and the global context information, leading to an enhanced compression rate. The proposed method scans decoded latents and then finds the most relevant latent to assist the distribution estimating of the current latent. A by-product of this work is the innovation of a mean-shifting GDN module that further improves the performance. 
Experimental results demonstrate that the proposed model outperforms the rate-distortion performance of most of the state-of-the-art methods in the industry.", + "title": "Learning Accurate Entropy Model with Global Reference for Image Compression", + "authors": [ + "Yichen Qian", + "Zhiyu Tan", + "Xiuyu Sun", + "Ming Lin", + "Dongyang Li", + "Zhenhong Sun", + "Li Hao", + "Rong Jin" + ], + "emails": [ + "~Xiuyu_Sun1", + "~Yichen_Qian1", + "~Rong_Jin1", + "~Ming_Lin4", + "~Li_Hao1", + "alibaba-inc.com" + ], + "rank": 972 + }, + { + "url": "https://openreview.net/forum?id=wZ4yWvQ_g2y", + "decision": "Reject", + "ratings": [ + 5, + 6, + 7, + 6 + ], + "rating": "5.88", + "confidences": [ + 5, + 5, + 3, + 4 + ], + "abstract": "While pre-trained language models such as BERT and RoBERTa have achieved impressive results on various natural language processing tasks, they have huge numbers of parameters and suffer from huge computational and memory costs, which make them difficult for real-world deployment. Hence, model compression should be performed in order to reduce the computation and memory cost of pre-trained models. In this work, we aim to compress BERT and address the following two challenging practical issues: (1) The compression algorithm should be able to output multiple compressed models with different sizes and latencies, so as to support devices with different kinds of memory and latency limitations; (2) the algorithm should be downstream task agnostic, so that the compressed models are generally applicable for different downstream tasks. We leverage techniques in neural architecture search (NAS) and propose NAS-BERT, an efficient method for BERT compression. NAS-BERT trains a big supernet on a carefully designed search space containing various architectures and outputs multiple compressed models with adaptive sizes and latency. 
Furthermore, the training of NAS-BERT is conducted on standard self-supervised pre-training tasks (e.g., masked language model) and does not depend on specific downstream tasks. Thus, the models it produces can be used across various downstream tasks. The technical challenge of NAS-BERT is that training a big supernet on the pre-training task is extremely costly. We employ several techniques including block-wise search, search space pruning, and performance approximation to improve search efficiency and accuracy. Extensive experiments on GLUE benchmark datasets demonstrate that NAS-BERT can find lightweight models with better accuracy than previous approaches, and can be directly applied to different downstream tasks with adaptive model sizes for different requirements of memory or latency.", + "title": "Task-Agnostic and Adaptive-Size BERT Compression", + "authors": [ + "Jin Xu", + "Xu Tan", + "Renqian Luo", + "Kaitao Song", + "Li Jian", + "Tao Qin", + "Tie-Yan Liu" + ], + "emails": [ + "~Renqian_Luo1", + "~Xu_Tan1", + "~Tao_Qin1", + "~Tie-Yan_Liu1", + "~Kaitao_Song1", + "~Li_Jian1", + "~Jin_Xu5" + ], + "rank": 973 + }, + { + "url": "https://openreview.net/forum?id=sAzh_FTFDxz", + "decision": "Reject", + "ratings": [ + 7, + 4, + 7, + 6 + ], + "rating": "5.87", + "confidences": [ + 4, + 5, + 4, + 2 + ], + "abstract": "Anomaly detection presents a unique challenge in machine learning, due to the scarcity of labeled anomaly data. Recent work attempts to mitigate such problems by augmenting training of deep anomaly detection models with additional labeled anomaly samples. However, the labeled data often does not align with the target distribution and introduces harmful bias to the trained model. In this paper, we aim to understand the effect of a biased anomaly set on anomaly detection. 
We formally state the anomaly detection problem as a supervised learning task, and focus on the anomaly detector\u2019s recall at a given false positive rate as the main performance metric. Given two different anomaly score functions, we formally define their difference in performance as the relative scoring bias of the anomaly detectors. Along this line, our work provides two key contributions. We establish the first finite sample rates for estimating the relative scoring bias for deep anomaly detection, and empirically validate our theoretical results on both synthetic and real-world datasets. We also provide extensive empirical study on how a biased training anomaly set affects the anomaly score function and therefore the detection performance on different anomaly classes. Our study demonstrates scenarios in which the biased anomaly set can be useful or problematic, and provides a solid benchmark for future research. ", + "title": "Understanding the Effect of Bias in Deep Anomaly Detection", + "authors": [ + "Ziyu Ye", + "Yuxin Chen", + "Haitao Zheng" + ], + "emails": [ + "~Yuxin_Chen1", + "~Haitao_Zheng2", + "~Ziyu_Ye1" + ], + "rank": 974 + }, + { + "url": "https://openreview.net/forum?id=5rc0K0ezhqI", + "decision": "Reject", + "ratings": [ + 8, + 4, + 6, + 6 + ], + "rating": "5.87", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "The Information Bottleneck principle offers both a mechanism to explain how deep neural networks train and generalize, as well as a regularized objective with which to train models. However, multiple competing objectives are proposed in the literature, and the information-theoretic quantities used in these objectives are difficult to compute for large deep neural networks, which in turn limits their use as a training objective. 
In this work, we review these quantities, compare and unify previously proposed objectives, which allows us to develop surrogate objectives more friendly to optimization without relying on cumbersome tools such as density estimation. We find that these surrogate objectives allow us to apply the information bottleneck to modern neural network architectures. We demonstrate our insights on MNIST, CIFAR-10 and ImageNette with modern DNN architectures (ResNets).", + "title": "Unpacking Information Bottlenecks: Surrogate Objectives for Deep Learning", + "authors": [ + "Andreas Kirsch", + "Clare Lyle", + "Yarin Gal" + ], + "emails": [ + "~Andreas_Kirsch1", + "~Clare_Lyle1", + "~Yarin_Gal1" + ], + "rank": 975 + }, + { + "url": "https://openreview.net/forum?id=nzpLWnVAyah", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 8, + 6, + 6 + ], + "rating": "5.87", + "confidences": [ + 5, + 4, + 3, + 3 + ], + "abstract": "Fine-tuning pre-trained transformer-based language models such as BERT has become a common practice dominating leaderboards across various NLP benchmarks. Despite the strong empirical performance of fine-tuned models, fine-tuning is an unstable process: training the same model with multiple random seeds can result in a large variance of the task performance. Previous literature (Devlin et al., 2019; Lee et al., 2020; Dodge et al., 2020) identified two potential reasons for the observed instability: catastrophic forgetting and small size of the fine-tuning datasets. In this paper, we show that both hypotheses fail to explain the fine-tuning instability. We analyze BERT, RoBERTa, and ALBERT, fine-tuned on commonly used datasets from the GLUE benchmark, and show that the observed instability is caused by optimization difficulties that lead to vanishing gradients. 
Additionally, we show that the remaining variance of the downstream task performance can be attributed to differences in generalization where fine-tuned models with the same training loss exhibit noticeably different test performance. Based on our analysis, we present a simple but strong baseline that makes fine-tuning BERT-based models significantly more stable than the previously proposed approaches. Code to reproduce our results is available online: https://github.com/uds-lsv/bert-stable-fine-tuning.", + "title": "On the Stability of Fine-tuning BERT: Misconceptions, Explanations, and Strong Baselines", + "authors": [ + "Marius Mosbach", + "Maksym Andriushchenko", + "Dietrich Klakow" + ], + "emails": [ + "~Dietrich_Klakow1", + "~Maksym_Andriushchenko1", + "~Marius_Mosbach1" + ], + "rank": 976 + }, + { + "url": "https://openreview.net/forum?id=7ZJPhriEdRQ", + "decision": "Reject", + "ratings": [ + 7, + 6, + 4, + 6 + ], + "rating": "5.87", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Variational autoencoders (VAEs) often suffer from posterior collapse, which is a phenomenon that the learned latent space becomes uninformative. This is related to local optima introduced by a fixed hyperparameter resembling the data variance in the objective function. We suggest that this variance parameter regularizes the VAE and affects its smoothness, which is the magnitude of its gradient. An inappropriate choice of this parameter causes oversmoothness and leads to posterior collapse. This is shown theoretically by analysis on the linear approximated objective function and empirically in general cases. We propose AR-ELBO, which stands for adaptively regularized ELBO~(Evidence Lower BOund). It controls the strength of regularization by adapting the variance parameter, and thus avoids oversmoothing the model. 
Generation models trained by proposed objectives show improved Fr\u00e9chet inception distance~(FID) of images generated from the MNIST and CelebA datasets.", + "title": "AR-ELBO: Preventing Posterior Collapse Induced by Oversmoothing in Gaussian VAE", + "authors": [ + "Yuhta Takida", + "Wei-Hsiang Liao", + "Toshimitsu Uesaka", + "Shusuke Takahashi", + "Yuki Mitsufuji" + ], + "emails": [ + "~Yuhta_Takida1", + "sony.com", + "~Yuki_Mitsufuji1" + ], + "rank": 977 + }, + { + "url": "https://openreview.net/forum?id=GBjukBaBLXK", + "decision": "Reject", + "ratings": [ + 4, + 7, + 4, + 8 + ], + "rating": "5.87", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Deep learning has achieved state-of-the-art performance to generate high-quality prediction intervals (PIs) for uncertainty quantification in regression tasks. The high-quality criterion requires PIs to be as narrow as possible, whilst maintaining a pre-specified level of data (marginal) coverage. However, most existing works for high-quality PIs lack accurate information on conditional coverage, which may cause unreliable predictions if it is significantly smaller than the marginal coverage. To address this problem, we propose a novel end-to-end framework which could output high-quality PIs and simultaneously provide their conditional coverage estimation. In doing so, we design a new loss function that is both easy-to-implement and theoretically justified via an exponential concentration bound. Our evaluation on real-world benchmark datasets and synthetic examples shows that our approach not only outperforms the state-of-the-arts on high-quality PIs in terms of average PI width, but also accurately estimates conditional coverage information that is useful in assessing model uncertainty. 
", + "title": "Conditional Coverage Estimation for High-quality Prediction Intervals", + "authors": [ + "Ziyi Huang", + "Henry Lam", + "Haofeng Zhang" + ], + "emails": [ + "~Henry_Lam2", + "~Ziyi_Huang1", + "~Haofeng_Zhang1" + ], + "rank": 978 + }, + { + "url": "https://openreview.net/forum?id=sRA5rLNpmQc", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 6, + 5 + ], + "rating": "5.87", + "confidences": [ + 3, + 4, + 5, + 3 + ], + "abstract": "Adversarial attacks against deep networks can be defended against either by building robust classifiers or, by creating classifiers that can \\emph{detect} the presence of adversarial perturbations. Although it may intuitively seem easier to simply detect attacks rather than build a robust classifier, this has not bourne out in practice even empirically, as most detection methods have subsequently been broken by adaptive attacks, thus necessitating \\emph{verifiable} performance for detection mechanisms. In this paper, we propose a new method for jointly training a provably robust classifier and detector. Specifically, we show that by introducing an additional \"abstain/detection\" into a classifier, we can modify existing certified defense mechanisms to allow the classifier to either robustly classify \\emph{or} detect adversarial attacks. We extend the common interval bound propagation (IBP) method for certified robustness under \u2113\u221e perturbations to account for our new robust objective, and show that the method outperforms traditional IBP used in isolation, especially for large perturbation sizes. 
Specifically, tests on MNIST and CIFAR-10 datasets exhibit promising results, for example with provable robust error less than 63.63% and 67.92%, for 55.6% and 66.37% natural error, for \u03f5=8/255 and 16/255 on the CIFAR-10 dataset, respectively.\n", + "title": "Provably robust classification of adversarial examples with detection", + "authors": [ + "Fatemeh Sheikholeslami", + "Ali Lotfi", + "J Zico Kolter" + ], + "emails": [ + "~Fatemeh_Sheikholeslami1", + "~Ali_Lotfi1", + "~J_Zico_Kolter1" + ], + "rank": 979 + }, + { + "url": "https://openreview.net/forum?id=ELiYxj9JlyW", + "decision": "Reject", + "ratings": [ + 8, + 4, + 7, + 4 + ], + "rating": "5.87", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Examples that are close to the decision boundary\u2014that we term hard examples, are essential to shaping accurate classifiers. Extracting confident examples has been widely studied in the community of learning with noisy labels. However, it remains elusive how to extract hard confident examples from the noisy training data. In this paper, we propose a deep learning paradigm to solve this problem, which is built on the memorization effect of deep neural networks that they would first learn simple patterns, i.e., which are defined by these shared by multiple training examples. To extract hard confident examples that contain non-simple patterns and are entangled with the inaccurately labeled examples, we borrow the idea of momentum from physics. Specifically, we alternately update the confident examples and refine the classifier. Note that the extracted confident examples in the previous round can be exploited to learn a better classifier and that the better classifier will help identify better (and hard) confident examples. We call the approach the \u201cMomentum of Memorization\u201d (Me-Momentum). 
Empirical results on benchmark-simulated and real-world label-noise data illustrate the effectiveness of Me-Momentum for extracting hard confident examples, leading to better classification performance.", + "title": "ME-MOMENTUM: EXTRACTING HARD CONFIDENT EXAMPLES FROM NOISILY LABELED DATA", + "authors": [ + "Yingbin Bai", + "Tongliang Liu" + ], + "emails": [ + "~Yingbin_Bai1", + "~Tongliang_Liu1" + ], + "rank": 980 + }, + { + "url": "https://openreview.net/forum?id=LucJxySuJcE", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 5, + 7, + 6 + ], + "rating": "5.87", + "confidences": [ + 3, + 5, + 3, + 4 + ], + "abstract": "Several recent works have demonstrated highly effective model stealing (MS) attacks on Deep Neural Networks (DNNs) in black-box settings, even when the training data is unavailable. These attacks typically use some form of Out of Distribution (OOD) data to query the target model and use the predictions obtained to train a clone model. Such a clone model learns to approximate the decision boundary of the target model, achieving high accuracy on in-distribution examples. We propose Ensemble of Diverse Models (EDM) to defend against such MS attacks. EDM is made up of models that are trained to produce dissimilar predictions for OOD inputs. By using a different member of the ensemble to service different queries, our defense produces predictions that are highly discontinuous in the input space for the adversary's OOD queries. Such discontinuities cause the clone model trained on these predictions to have poor generalization on in-distribution examples. Our evaluations on several image classification tasks demonstrate that EDM defense can severely degrade the accuracy of clone models (up to 39.7%). 
Our defense has minimal impact on the target accuracy, negligible computational costs during inference, and is compatible with existing defenses for MS attacks.", + "title": "Protecting DNNs from Theft using an Ensemble of Diverse Models", + "authors": [ + "Sanjay Kariyappa", + "Atul Prakash", + "Moinuddin K Qureshi" + ], + "emails": [ + "~Moinuddin_K_Qureshi2", + "~Atul_Prakash1", + "~Sanjay_Kariyappa1" + ], + "rank": 981 + }, + { + "url": "https://openreview.net/forum?id=A7-rYAC-np1", + "decision": "Reject", + "ratings": [ + 5, + 4, + 8, + 6 + ], + "rating": "5.87", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "We are far from having a complete mechanistic understanding of the brain computations involved in language processing and of the role that syntax plays in those computations. Most language studies do not computationally model syntactic structure, and most studies that do model syntactic processing use effort-based metrics. These metrics capture the effort needed to process the syntactic information given by every word (Brennan et al., 2012; Hale et al., 2018; Brennan et al.,2016). They can reveal where in the brain syntactic processing occurs, but not what features of syntax are processed by different brain regions. Here, we move beyond effort-based metrics and propose explicit features capturing the syntactic structure that is incrementally built while a sentence is being read. Using these features and functional Magnetic Resonance Imaging (fMRI) recordings of participants reading a natural text, we study the brain representation of syntax. We find that our syntactic structure-based features are better than effort-based metrics at predicting brain activity in various parts of the language system. We show evidence of the brain representation of complex syntactic information such as phrase and clause structures. 
We see that regions well-predicted by syntactic features are distributed in the language system and are not distinguishable from those processing semantics. Our results call for a shift in the approach used for studying syntactic processing.", + "title": "Syntactic representations in the human brain: beyond effort-based metrics", + "authors": [ + "Aniketh Janardhan Reddy", + "Leila Wehbe" + ], + "emails": [ + "~Leila_Wehbe1", + "cs.cmu.edu" + ], + "rank": 982 + }, + { + "url": "https://openreview.net/forum?id=M9hdyCNlWaf", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 5 + ], + "rating": "5.86", + "confidences": [ + 4, + 4, + 4, + 2 + ], + "abstract": "Bayesian neural networks and deep ensembles represent two modern paradigms of uncertainty quantification in deep learning. Yet these approaches struggle to scale mainly due to memory inefficiency issues, since they require parameter storage several times higher than their deterministic counterparts. To address this, we augment the weight matrix of each layer with a small number of inducing weights, thereby projecting the uncertainty quantification into such low dimensional spaces. We further extend Matheron's conditional Gaussian sampling rule to enable fast weight sampling, whichenable our inference method to maintain reasonable run-time as compared with ensembles. Importantly, our approach achieves competitive performance to the state-of-the-art in prediction and uncertainty estimation tasks with fully connected neural networks and ResNets, while reducing the parameter size to \u226447.9% of that of a single neural network. 
", + "title": "Sparse Uncertainty Representation in Deep Learning with Inducing Weights", + "authors": [ + "Hippolyt Ritter", + "Martin Kukla", + "Cheng Zhang", + "Yingzhen Li" + ], + "emails": [ + "~Cheng_Zhang1", + "~Martin_Kukla1", + "~Yingzhen_Li1", + "~Hippolyt_Ritter1" + ], + "rank": 983 + }, + { + "url": "https://openreview.net/forum?id=arD29HCZG6O", + "decision": "Reject", + "ratings": [ + 6, + 7, + 4, + 6 + ], + "rating": "5.86", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Representational sparsity is known to affect robustness to input perturbations in deep neural networks (DNNs), but less is known about how the semantic content of representations affects robustness. Class selectivity\u2014the variability of a unit\u2019s responses across data classes or dimensions\u2014is one way of quantifying the sparsity of semantic representations. Given recent evidence that class selectivity may not be necessary for, and in some cases can impair generalization, we sought to investigate whether it also confers robustness (or vulnerability) to perturbations of input data. We found that class selectivity leads to increased vulnerability to average-case (naturalistic) perturbations in ResNet18, ResNet50, and ResNet20, as measured using Tiny ImageNetC (ResNet18 and ResNet50) and CIFAR10C (ResNet20). Networks regularized to have lower levels of class selectivity are more robust to average-case perturbations, while networks with higher class selectivity are more vulnerable. In contrast, we found that class selectivity increases robustness to multiple types of worst-case (i.e. white box adversarial) perturbations, suggesting that while decreasing class selectivity is helpful for average-case perturbations, it is harmful for worst-case perturbations. 
To explain this difference, we studied the dimensionality of the networks' representations: we found that the dimensionality of early-layer representations is inversely proportional to a network's class selectivity, and that adversarial samples cause a larger increase in early-layer dimensionality than corrupted samples. We also found that the input-unit gradient was more variable across samples and units in high-selectivity networks compared to low-selectivity networks. These results lead to the conclusion that units participate more consistently in low-selectivity regimes compared to high-selectivity regimes, effectively creating a larger attack surface and hence vulnerability to worst-case perturbations.", + "title": "Linking average- and worst-case perturbation robustness via class selectivity and dimensionality", + "authors": [ + "Matthew L Leavitt", + "Ari S. Morcos" + ], + "emails": [ + "~Matthew_L_Leavitt1", + "~Ari_S._Morcos1" + ], + "rank": 984 + }, + { + "url": "https://openreview.net/forum?id=bQf4aGhfmFx", + "decision": "Reject", + "ratings": [ + 3, + 8, + 5, + 7 + ], + "rating": "5.86", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Loss-function metalearning can be used to discover novel, customized loss functions for deep neural networks, resulting in improved performance, faster training, and improved data utilization. A likely explanation is that such functions discourage overfitting, leading to effective regularization. This paper theoretically demonstrates that this is indeed the case: decomposition of learning rules makes it possible to characterize the training dynamics and show that loss functions evolved through TaylorGLO regularize both in the beginning and end of learning, and maintain an invariant in between. The invariant can be utilized to make the metalearning process more efficient in practice, and the regularization can train networks that are robust against adversarial attacks. 
Loss-function optimization can thus be seen as a well-founded new aspect of metalearning in neural networks.", + "title": "Effective Regularization Through Loss-Function Metalearning", + "authors": [ + "Santiago Gonzalez", + "Risto Miikkulainen" + ], + "emails": [ + "~Santiago_Gonzalez1", + "~Risto_Miikkulainen1" + ], + "rank": 985 + }, + { + "url": "https://openreview.net/forum?id=pVwU-8cdjQQ", + "decision": "Reject", + "ratings": [ + 6, + 7, + 6, + 4 + ], + "rating": "5.86", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Unsupervised multi-object scene decomposition is a fast-emerging problem in representation learning. Despite significant progress in static scenes, such models are unable to leverage important dynamic cues present in video. We propose a novel spatio-temporal iterative inference framework that is powerful enough to jointly model complex multi-object representations and explicit temporal dependencies between latent variables across frames. This is achieved by leveraging 2D-LSTM, temporally conditioned inference and generation within the iterative amortized inference for posterior refinement. Our method improves the overall quality of decompositions, encodes information about the objects' dynamics, and can be used to predict trajectories of each object separately. Additionally, we show that our model has a high accuracy even without color information. 
We demonstrate the decomposition, segmentation, and prediction capabilities of our model and show that it outperforms the state-of-the-art on several benchmark datasets, one of which was curated for this work and will be made publicly available.", + "title": "Unsupervised Video Decomposition using Spatio-temporal Iterative Inference", + "authors": [ + "Polina Zablotskaia", + "Edoardo Alberto Dominici", + "Leonid Sigal", + "Andreas Lehrmann" + ], + "emails": [ + "~Andreas_Lehrmann1", + "~Polina_Zablotskaia1", + "~Edoardo_Alberto_Dominici1", + "~Leonid_Sigal2" + ], + "rank": 986 + }, + { + "url": "https://openreview.net/forum?id=Oecm1tBcguW", + "decision": "Reject", + "ratings": [ + 6, + 7, + 7, + 4 + ], + "rating": "5.86", + "confidences": [ + 4, + 4, + 2, + 4 + ], + "abstract": "Bayesian deep learning is a promising approach towards improved uncertainty quantification and sample efficiency.\nDue to their complex parameter space, choosing informative priors for Bayesian Neural Networks (BNNs) is challenging. Thus, often a naive, zero-centered Gaussian is used, resulting both in bad generalization and poor uncertainty estimates when training data is scarce. In contrast, meta-learning aims to extract such prior knowledge from a set of related learning tasks. We propose a principled and scalable algorithm for meta-learning BNN priors based on PAC-Bayesian bounds. Whereas previous approaches require optimizing the prior and multiple variational posteriors in an interdependent manner, our method does not rely on difficult nested optimization problems and is agnostic to the variational inference method in use. 
Our experiments show that the proposed method is not only computationally more efficient but also yields better predictions and uncertainty estimates when compared to previous meta-learning methods and BNNs with standard priors.", + "title": "Meta-Learning Bayesian Neural Network Priors Based on PAC-Bayesian Theory", + "authors": [ + "Jonas Rothfuss", + "Martin Josifoski", + "Andreas Krause" + ], + "emails": [ + "~Andreas_Krause1", + "~Jonas_Rothfuss1", + "epfl.ch" + ], + "rank": 987 + }, + { + "url": "https://openreview.net/forum?id=RGeQOjc58d", + "decision": "Reject", + "ratings": [ + 7, + 6, + 5, + 5, + 6 + ], + "rating": "5.86", + "confidences": [ + 4, + 5, + 3, + 4, + 5 + ], + "abstract": "Neural network quantization has become increasingly popular due to efficient memory consumption and faster computation resulting from bitwise operations on the quantized networks. Even though they exhibit excellent generalization capabilities, their robustness properties are not well-understood. In this work, we systematically study the robustness of quantized networks against gradient based adversarial attacks and demonstrate that these quantized models suffer from gradient vanishing issues and show a fake sense of robustness. By attributing gradient vanishing to poor forward-backward signal propagation in the trained network, we introduce a simple temperature scaling approach to mitigate this issue while preserving the decision boundary. 
Despite being a simple modification to existing gradient based adversarial attacks, experiments on CIFAR-10/100 datasets with multiple network architectures demonstrate that our temperature scaled attacks obtain near-perfect success rate on quantized networks while outperforming original attacks on adversarially trained models as well as floating point networks.\n", + "title": "Improved Gradient based Adversarial Attacks for Quantized Networks", + "authors": [ + "Kartik Gupta", + "Thalaiyasingam Ajanthan" + ], + "emails": [ + "~Kartik_Gupta2", + "~Thalaiyasingam_Ajanthan1" + ], + "rank": 988 + }, + { + "url": "https://openreview.net/forum?id=cQzf26aA3vM", + "decision": "Reject", + "ratings": [ + 5, + 7, + 5, + 6 + ], + "rating": "5.86", + "confidences": [ + 2, + 4, + 4, + 4 + ], + "abstract": "Black-box model-based optimization (MBO) problems, where the goal is to find a design input that maximizes an unknown objective function, are ubiquitous in a wide range of domains, such as the design of drugs, aircraft, and robot morphology. Typically, such problems are solved by actively querying the black-box objective on design proposals and using the resulting feedback to improve the proposed designs. However, when the true objective function is expensive or dangerous to evaluate in the real world, we might instead prefer a method that can optimize this function using only previously collected data, for example from a set of previously conducted experiments. This data-driven offline MBO setting presents a number of unique challenges, but a number of recent works have demonstrated that viable offline MBO methods can be developed even for high-dimensional problems, using high-capacity deep neural network function approximators. Unfortunately, the lack of standardized evaluation tasks in this emerging new field has made tracking progress and comparing recent methods difficult. 
To address this problem, we present Design-Bench, a benchmark suite of offline MBO tasks with a unified evaluation protocol and reference implementations of recent methods. Our benchmark suite includes diverse and realistic tasks derived from real-world problems in biology, material science, and robotics that present distinct challenges for offline MBO methods. Our benchmarks, together with the reference implementations, are available at sites.google.com/view/design-bench. We hope that our benchmark can serve as a meaningful metric for the progress of offline MBO methods and guide future algorithmic development.", + "title": "Design-Bench: Benchmarks for Data-Driven Offline Model-Based Optimization", + "authors": [ + "Brandon Trabucco", + "Aviral Kumar", + "Xinyang Geng", + "Sergey Levine" + ], + "emails": [ + "~Xinyang_Geng1", + "~Sergey_Levine1", + "~Aviral_Kumar2", + "~Brandon_Trabucco1" + ], + "rank": 989 + }, + { + "url": "https://openreview.net/forum?id=tu29GQT0JFy", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 4 + ], + "rating": "5.86", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "When a missing process depends on the missing values themselves, it needs to be explicitly modelled and taken into account while doing likelihood-based inference. We present an approach for building and fitting deep latent variable models (DLVMs) in cases where the missing process is dependent on the missing data. Specifically, a deep neural network enables us to flexibly model the conditional distribution of the missingness pattern given the data. This allows for incorporating prior information about the type of missingness (e.g.~self-censoring) into the model. Our inference technique, based on importance-weighted variational inference, involves maximising a lower bound of the joint likelihood. Stochastic gradients of the bound are obtained by using the reparameterisation trick both in latent space and data space. 
We show on various kinds of data sets and missingness patterns that explicitly modelling the missing process can be invaluable.", + "title": "not-MIWAE: Deep Generative Modelling with Missing not at Random Data", + "authors": [ + "Niels Bruun Ipsen", + "Pierre-Alexandre Mattei", + "Jes Frellsen" + ], + "emails": [ + "~Jes_Frellsen1", + "~Niels_Bruun_Ipsen1", + "~Pierre-Alexandre_Mattei3" + ], + "rank": 990 + }, + { + "url": "https://openreview.net/forum?id=6y3-wzlGHkb", + "decision": "Reject", + "ratings": [ + 7, + 6, + 5, + 5 + ], + "rating": "5.86", + "confidences": [ + 4, + 4, + 2, + 4 + ], + "abstract": "Recent work ties adversarial examples to existence of non-robust features: features which are susceptible to small perturbations and believed to be unintelligible to humans, but still useful for prediction. We study universal adversarial perturbations and demonstrate that the above picture is more nuanced. Specifically, even though universal perturbations---similarly to standard adversarial perturbations---do leverage non-robust features, these features tend to be fundamentally different from the ``standard'' ones and, in particular, non-trivially human-aligned. Namely, universal perturbations have more human-aligned locality and spatial invariance properties. However, we also show that these human-aligned non-robust features have much less predictive signal than general non-robust features. 
Our findings thus take a step towards improving our understanding of these previously unintelligible features.", + "title": "Non-robust Features through the Lens of Universal Perturbations", + "authors": [ + "Sung Min Park", + "Kuo-An Wei", + "Kai Yuanqing Xiao", + "Jerry Li", + "Aleksander Madry" + ], + "emails": [ + "~Sung_Min_Park2", + "~Kai_Yuanqing_Xiao1", + "~Aleksander_Madry1", + "mit.edu", + "~Jerry_Li1" + ], + "rank": 991 + }, + { + "url": "https://openreview.net/forum?id=rgFNuJHHXv", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 5, + 6, + 6 + ], + "rating": "5.86", + "confidences": [ + 4, + 2, + 4, + 4 + ], + "abstract": "Recent improvements in generative adversarial visual synthesis incorporate real and fake image transformation in a self-supervised setting, leading to increased stability and perceptual fidelity. However, these approaches typically involve image augmentations via additional regularizers in the GAN objective and thus spend valuable network capacity towards approximating transformation equivariance instead of their desired task. In this work, we explicitly incorporate inductive symmetry priors into the network architectures via group-equivariant convolutional networks. Group-convolutions have higher expressive power with fewer samples and lead to better gradient feedback between generator and discriminator. We show that group-equivariance integrates seamlessly with recent techniques for GAN training across regularizers, architectures, and loss functions. 
We demonstrate the utility of our methods for conditional synthesis by improving generation in the limited data regime across symmetric imaging datasets and even find benefits for natural images with preferred orientation.", + "title": "Group Equivariant Generative Adversarial Networks", + "authors": [ + "Neel Dey", + "Antong Chen", + "Soheil Ghafurian" + ], + "emails": [ + "~Neel_Dey1", + "~Antong_Chen2", + "~Soheil_Ghafurian1" + ], + "rank": 992 + }, + { + "url": "https://openreview.net/forum?id=L7Irrt5sMQa", + "decision": "Reject", + "ratings": [ + 7, + 7, + 5, + 5 + ], + "rating": "5.86", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "Graph neural networks (GNNs) are effective models for representation learning on graph-structured data. However, standard GNNs are limited in their expressive power, as they cannot distinguish graphs beyond the capability of the Weisfeiler-Leman (1-WL) graph isomorphism heuristic. This limitation motivated a large body of work, including higher-order GNNs, which are provably more powerful models. To date, higher-order invariant and equivariant networks are the only models with known universality results, but these results are practically hindered by prohibitive computational complexity. Thus, despite their limitations, standard GNNs are commonly used, due to their strong practical performance. In practice, GNNs have shown a promising performance when enhanced with random node initialization (RNI), where the idea is to train and run the models with randomized initial node features. In this paper, we analyze the expressive power of GNNs with RNI, and pose the following question: are GNNs with RNI more expressive than GNNs? We prove that this is indeed the case, by showing that GNNs with RNI are universal, a first such result for GNNs not relying on computationally demanding higher-order properties. We then empirically analyze the effect of RNI on GNNs, based on carefully constructed datasets. 
Our empirical findings support the superior performance of GNNs with RNI over standard GNNs. In fact, we demonstrate that the performance of GNNs with RNI is often comparable with or better than that of higher-order GNNs, while keeping the much lower memory requirements of standard GNNs. However, this improvement typically comes at the cost of slower model convergence. Somewhat surprisingly, we found that the convergence rate and the accuracy of the models can be improved by using only a partial random initialization regime.", + "title": "The Surprising Power of Graph Neural Networks with Random Node Initialization", + "authors": [ + "Ralph Abboud", + "Ismail Ilkan Ceylan", + "Martin Grohe", + "Thomas Lukasiewicz" + ], + "emails": [ + "~Martin_Grohe1", + "~Thomas_Lukasiewicz2", + "~Ismail_Ilkan_Ceylan2", + "cs.ox.ac.uk" + ], + "rank": 993 + }, + { + "url": "https://openreview.net/forum?id=RmB-zwXOIVC", + "decision": "Reject", + "ratings": [ + 5, + 6, + 8, + 5 + ], + "rating": "5.86", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "We propose a new framework for Imitation Learning (IL) via density estimation of the expert's occupancy measure followed by Maximum Occupancy Entropy Reinforcement Learning (RL) using the density as a reward. Our approach maximizes a non-adversarial model-free RL objective that provably lower bounds reverse Kullback\u2013Leibler divergence between occupancy measures of the expert and imitator. We present a practical IL algorithm, Neural Density Imitation (NDI), which obtains state-of-the-art demonstration efficiency on benchmark control tasks. 
", + "title": "Imitation with Neural Density Models", + "authors": [ + "Kuno Kim", + "Akshat Jindal", + "Yang Song", + "Jiaming Song", + "Yanan Sui", + "Stefano Ermon" + ], + "emails": [ + "~Yang_Song1", + "~Jiaming_Song1", + "cs.stanford.edu", + "~Yanan_Sui1", + "~Kuno_Kim1", + "~Stefano_Ermon1" + ], + "rank": 994 + }, + { + "url": "https://openreview.net/forum?id=RdhjoXl-SDG", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5 + ], + "rating": "5.86", + "confidences": [ + 3, + 3, + 1 + ], + "abstract": "High-dimensional Bayesian inference problems cast a long-standing challenge in generating samples, especially when the posterior has multiple modes. For a wide class of Bayesian inference problems equipped with the multiscale structure that low-dimensional (coarse-scale) surrogate can approximate the original high-dimensional (fine-scale) problem well, we propose to train a Multiscale Invertible Generative Network (MsIGN) for sample generation. A novel prior conditioning layer is designed to bridge networks at different resolutions, enabling coarse-to-fine multi-stage training. Jeffreys divergence is adopted as the training objective to avoid mode dropping. On two high-dimensional Bayesian inverse problems, MsIGN approximates the posterior accurately and clearly captures multiple modes, showing superior performance compared with previous deep generative network approaches. 
On the natural image synthesis task, MsIGN achieves the superior performance in bits-per-dimension compared with our baseline models and yields great interpret-ability of its neurons in intermediate layers.", + "title": "Multiscale Invertible Generative Networks for High-Dimensional Bayesian Inference", + "authors": [ + "Shumao Zhang", + "Thomas Hou", + "Pengchuan Zhang" + ], + "emails": [ + "~Thomas_Hou2", + "~Shumao_Zhang1", + "~Pengchuan_Zhang1" + ], + "rank": 995 + }, + { + "url": "https://openreview.net/forum?id=wbQXW1XTq_y", + "decision": "Reject", + "ratings": [ + 5, + 6, + 7, + 5 + ], + "rating": "5.85", + "confidences": [ + 3, + 3, + 4, + 3 + ], + "abstract": "Scene graph (SG) generation has been gaining a lot of traction recently. Current SG generation techniques, however, rely on the availability of expensive and limited number of labeled datasets. Synthetic data offers a viable alternative as labels are essentially free. However, neural network models trained on synthetic data, do not perform well on real data because of the domain gap.\nTo overcome this challenge, we propose Sim2SG, a scalable technique for sim-to-real transfer for scene graph generation. Sim2SG addresses the domain gap by decomposing it into appearance, label and prediction discrepancies between the two domains. We handle these discrepancies by introducing pseudo statistic based self-learning and adversarial techniques. 
Sim2SG does not require costly supervision from the real-world dataset.\nOur experiments demonstrate significant improvements over baselines in reducing the domain gap both qualitatively and quantitatively.\nWe validate our approach on toy simulators, as well as realistic simulators evaluated on real-world data.", + "title": "Sim2SG: Sim-to-Real Scene Graph Generation for Transfer Learning", + "authors": [ + "Aayush Prakash", + "Shoubhik Debnath", + "Jean Francois Lafleche", + "Eric Cameracci", + "Gavriel State", + "Marc T Law" + ], + "emails": [ + "nvidia.com", + "~Shoubhik_Debnath1", + "~Marc_T_Law1", + "~Aayush_Prakash1" + ], + "rank": 996 + }, + { + "url": "https://openreview.net/forum?id=iKQAk8a2kM0", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 5 + ], + "rating": "5.85", + "confidences": [ + 4, + 2, + 3, + 4 + ], + "abstract": "To explore the vulnerability of deep neural networks (DNNs), many attack paradigms have been well studied, such as the poisoning-based backdoor attack in the training stage and the adversarial attack in the inference stage. In this paper, we study a novel attack paradigm, which modifies model parameters in the deployment stage for malicious purposes. Specifically, our goal is to misclassify a specific sample into a target class without any sample modification, while not significantly reduce the prediction accuracy of other samples to ensure the stealthiness. To this end, we formulate this problem as a binary integer programming (BIP), since the parameters are stored as binary bits (i.e., 0 and 1) in the memory. By utilizing the latest technique in integer programming, we equivalently reformulate this BIP problem as a continuous optimization problem, which can be effectively and efficiently solved using the alternating direction method of multipliers (ADMM) method. Consequently, the flipped critical bits can be easily determined through optimization, rather than using a heuristic strategy. 
Extensive experiments demonstrate the superiority of our method in attacking DNNs.", + "title": "Targeted Attack against Deep Neural Networks via Flipping Limited Weight Bits", + "authors": [ + "Jiawang Bai", + "Baoyuan Wu", + "Yong Zhang", + "Yiming Li", + "Zhifeng Li", + "Shu-Tao Xia" + ], + "emails": [ + "~Zhifeng_Li5", + "~Jiawang_Bai2", + "~Yong_Zhang6", + "~Shu-Tao_Xia1", + "~Baoyuan_Wu1", + "~Yiming_Li1" + ], + "rank": 997 + }, + { + "url": "https://openreview.net/forum?id=U850oxFSKmN", + "decision": "Reject", + "ratings": [ + 7, + 4, + 7, + 5 + ], + "rating": "5.85", + "confidences": [ + 3, + 3, + 4, + 3 + ], + "abstract": "Learning continuous-time stochastic dynamics is a fundamental and essential problem in modeling sporadic time series, whose observations are irregular and sparse in both time and dimension. For a given system whose latent states and observed data are high-dimensional, it is generally impossible to derive a precise continuous-time stochastic process to describe the system behaviors. To solve the above problem, we apply Variational Bayesian method and propose a flexible continuous-time stochastic recurrent neural network named Variational Stochastic Differential Networks (VSDN), which embed the complicated dynamics of thesporadic time series by neural Stochastic Differential Equations (SDE). VSDNs capture the stochastic dependency among latent states and observations by deep neural networks. We also incorporate two differential Evidence Lower Bounds to efficiently train the models. 
Through comprehensive experiments, we show that VSDNs outperform state-of-the-art continuous-time deep learning models and achieve remarkable performance on prediction and interpolation tasks for sporadic data.", + "title": "Learning Continuous-Time Dynamics by Stochastic Differential Networks", + "authors": [ + "Yingru Liu", + "Yucheng Xing", + "Xuewen Yang", + "Xin Wang", + "Jing Shi", + "Di Jin", + "Zhaoyue Chen" + ], + "emails": [ + "~Di_Jin1", + "~Jing_Shi1", + "~Yucheng_Xing1", + "~Xuewen_Yang1", + "~Xin_Wang7", + "~Zhaoyue_Chen1", + "~Yingru_Liu1" + ], + "rank": 998 + }, + { + "url": "https://openreview.net/forum?id=I4pQCAhSu62", + "decision": "Reject", + "ratings": [ + 5, + 7, + 6, + 5 + ], + "rating": "5.85", + "confidences": [ + 2, + 4, + 3, + 4 + ], + "abstract": "It is generally believed that robust training of extremely large networks is critical to their success in real-world applications. However, when taken to the extreme, methods that promote robustness can hurt the model\u2019s sensitivity to rare or underrepresented patterns. In this paper, we discuss this trade-off between robustness and sensitivity by introducing two notions: contextual feature utility and contextual feature sensitivity. We propose Feature Contrastive Learning (FCL) that encourages the model to be more sensitive to the features that have higher contextual utility. 
Empirical results demonstrate that models trained with FCL achieve a better balance of robustness and sensitivity, leading to improved generalization in the presence of noise.", + "title": "Balancing Robustness and Sensitivity using Feature Contrastive Learning", + "authors": [ + "Seungyeon Kim", + "Daniel Glasner", + "Srikumar Ramalingam", + "Cho-Jui Hsieh", + "Kishore Papineni", + "Sanjiv Kumar" + ], + "emails": [ + "~Srikumar_Ramalingam2", + "~Cho-Jui_Hsieh1", + "~Daniel_Glasner2", + "~Seungyeon_Kim1", + "google.com", + "~Sanjiv_Kumar1" + ], + "rank": 999 + }, + { + "url": "https://openreview.net/forum?id=1GTma8HwlYp", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 5, + 6, + 6 + ], + "rating": "5.85", + "confidences": [ + 3, + 2, + 3, + 5 + ], + "abstract": "While deep learning has been very beneficial in data-rich settings, tasks with smaller training set\noften resort to pre-training or multitask learning to leverage data from other tasks. In this case,\ncareful consideration is needed to select tasks and model parameterizations such that updates from\nthe auxiliary tasks actually help the primary task. We seek to alleviate this burden by formulating a model-agnostic framework that performs fine-grained manipulation of the auxiliary task gradients. We propose to decompose auxiliary updates into directions which help, damage or leave the primary task loss unchanged. This allows weighting the update directions \ndifferently depending on their impact on the problem of interest. We present a novel and efficient algorithm for that\npurpose and show its advantage in practice. Our method leverages efficient automatic differentiation \nprocedures and randomized singular value decomposition for scalability. We show that our framework is \ngeneric and encompasses some prior work as particular cases. 
Our approach consistently outperforms strong and widely used baselines when leveraging out-of-distribution data for Text and Image classification tasks.", + "title": "AUXILIARY TASK UPDATE DECOMPOSITION: THE GOOD, THE BAD AND THE NEUTRAL", + "authors": [ + "Lucio M. Dery", + "Yann Dauphin", + "David Grangier" + ], + "emails": [ + "~Yann_Dauphin1", + "~David_Grangier1", + "~Lucio_M._Dery1" + ], + "rank": 1000 + }, + { + "url": "https://openreview.net/forum?id=i7aMbliTkHs", + "decision": "Reject", + "ratings": [ + 8, + 4, + 6 + ], + "rating": "5.85", + "confidences": [ + 4, + 5, + 4 + ], + "abstract": "Temporal modeling is crucial for capturing spatiotemporal structure in videos for action recognition. Video data is with extremely complex dynamics along its temporal dimension due to various factors such as camera motion, speed variation, and different activities. To effectively capture this diverse motion pattern, this paper presents a new temporal adaptive module ({\\bf TAM}) to generate video-specific kernels based on its own feature maps. TAM proposes a unique two-level adaptive modeling scheme by decoupling dynamic kernels into a location sensitive importance map and a location invariant aggregation weight. The importance map is learned in a local temporal window to capture short term information, while the aggregation weight is generated from a global view with a focus on long-term structure. TAM is a principled module and could be integrated into 2D CNNs to yield a powerful video architecture (TANet) with a very small extra computational cost. 
The extensive experiments on Kinetics-400 and Something-Something datasets, demonstrate that the TAM outperforms other temporal modeling methods consistently owing to its temporal adaptive modeling strategy.", + "title": "TAM: Temporal Adaptive Module for Video Recognition", + "authors": [ + "Zhaoyang Liu", + "Limin Wang", + "Wayne Wu", + "Chen Qian", + "Tong Lu" + ], + "emails": [ + "~Wayne_Wu1", + "~Limin_Wang1", + "~Tong_Lu1", + "~Zhaoyang_Liu1", + "~Chen_Qian1" + ], + "rank": 1001 + }, + { + "url": "https://openreview.net/forum?id=9l9WD4ahJgs", + "decision": "Reject", + "ratings": [ + 7, + 4, + 7, + 6 + ], + "rating": "5.85", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "Deep reinforcement learning (RL) agents often fail to generalize beyond their training environments. To alleviate this problem, recent work has proposed the use of data augmentation. However, different tasks tend to benefit from different types of augmentations and selecting the right one typically requires expert knowledge. In this paper, we introduce three approaches for automatically finding an effective augmentation for any RL task. These are combined with two novel regularization terms for the policy and value function, required to make the use of data augmentation theoretically sound for actor-critic algorithms. We evaluate our method on the Procgen benchmark which consists of 16 procedurally generated environments and show that it improves test performance by 40% relative to standard RL algorithms. Our approach also outperforms methods specifically designed to improve generalization in RL, thus setting a new state-of-the-art on Procgen. In addition, our agent learns policies and representations which are more robust to changes in the environment that are irrelevant for solving the task, such as the background. 
", + "title": "Automatic Data Augmentation for Generalization in Reinforcement Learning", + "authors": [ + "Roberta Raileanu", + "Maxwell Goldstein", + "Denis Yarats", + "Ilya Kostrikov", + "Rob Fergus" + ], + "emails": [ + "~Rob_Fergus1", + "~Maxwell_Goldstein1", + "~Roberta_Raileanu2", + "~Ilya_Kostrikov1", + "~Denis_Yarats1" + ], + "rank": 1002 + }, + { + "url": "https://openreview.net/forum?id=NqPW1ZJjXDJ", + "decision": "Reject", + "ratings": [ + 3, + 6, + 7, + 7 + ], + "rating": "5.85", + "confidences": [ + 3, + 3, + 4, + 3 + ], + "abstract": "Fine-tuning from pre-trained ImageNet models has been a simple, effective, and popular approach for various computer vision tasks. The common practice of fine-tuning is to adopt a default hyperparameter setting with a fixed pre-trained model, while both of them are not optimized for specific tasks and time constraints. Moreover, in cloud computing or GPU clusters where the tasks arrive sequentially in a stream, faster online fine-tuning is a more desired and realistic strategy for saving money, energy consumption, and CO2 emission. In this paper, we propose a joint Neural Architecture Search and Online Adaption framework named NASOA towards a faster task-oriented fine-tuning upon the request of users. Specifically, NASOA first adopts an offline NAS to identify a group of training-efficient networks to form a pretrained model zoo. We propose a novel joint block and macro-level search space to enable a flexible and efficient search. Then, by estimating fine-tuning performance via an adaptive model by accumulating experience from the past tasks, an online schedule generator is proposed to pick up the most suitable model and generate a personalized training regime with respect to each desired task in a one-shot fashion. The resulting model zoo is more training efficient than SOTA NAS models, e.g. 6x faster than RegNetY-16GF, and 1.7x faster than EfficientNetB3. 
Experiments on multiple datasets also show that NASOA achieves much better fine-tuning results, i.e. improving around 2.1% accuracy than the best performance in RegNet series under various time constraints and tasks; 40x faster compared to the BOHB method.", + "title": "NASOA: Towards Faster Task-oriented Online Fine-tuning", + "authors": [ + "Hang Xu", + "Ning Kang", + "Gengwei Zhang", + "Xiaodan Liang", + "Zhenguo Li" + ], + "emails": [ + "~Gengwei_Zhang1", + "huawei.com", + "~Hang_Xu1", + "~Xiaodan_Liang2", + "~Zhenguo_Li1" + ], + "rank": 1003 + }, + { + "url": "https://openreview.net/forum?id=U_mat0b9iv", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 4 + ], + "rating": "5.85", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Recently, Frankle & Carbin (2019) demonstrated that randomly-initialized dense networks contain subnetworks that once found can be trained to reach test accuracy comparable to the trained dense network. However, finding these high performing trainable subnetworks is expensive, requiring iterative process of training and pruning weights. In this paper, we propose (and prove) a stronger Multi-Prize Lottery Ticket Hypothesis:\n\nA sufficiently over-parameterized neural network with random weights contains several subnetworks (winning tickets) that (a) have comparable accuracy to a dense target network with learned weights (prize 1), (b) do not require any further training to achieve prize 1 (prize 2), and (c) is robust to extreme forms of quantization (i.e., binary weights and/or activation) (prize 3).\n\nThis provides a new paradigm for learning compact yet highly accurate binary neural networks simply by pruning and quantizing randomly weighted full precision neural networks. We also propose an algorithm for finding multi-prize tickets (MPTs) and test it by performing a series of experiments on CIFAR-10 and ImageNet datasets. 
Empirical results indicate that as models grow deeper and wider, multi-prize tickets start to reach similar (and sometimes even higher) test accuracy compared to their significantly larger and full-precision counterparts that have been weight-trained. Without ever updating the weight values, our MPTs-1/32 not only set new binary weight network state-of-the-art (SOTA) Top-1 accuracy -- 94.8% on CIFAR-10 and 74.03% on ImageNet -- but also outperform their full-precision counterparts by 1.78% and 0.76%, respectively. Further, our MPT-1/1 achieves SOTA Top-1 accuracy (91.9%) for binary neural networks on CIFAR-10. Code and pre-trained models are available at: https://github.com/chrundle/biprop.", + "title": "Multi-Prize Lottery Ticket Hypothesis: Finding Accurate Binary Neural Networks by Pruning A Randomly Weighted Network", + "authors": [ + "James Diffenderfer", + "Bhavya Kailkhura" + ], + "emails": [ + "~Bhavya_Kailkhura1", + "~James_Diffenderfer1" + ], + "rank": 1004 + }, + { + "url": "https://openreview.net/forum?id=RrIqhkFEpec", + "decision": "Reject", + "ratings": [ + 7, + 6, + 4, + 6 + ], + "rating": "5.85", + "confidences": [ + 4, + 2, + 3, + 4 + ], + "abstract": "High dimensional data is often assumed to be concentrated on or near a low-dimensional manifold. Autoencoders (AE) is a popular technique to learn representations of such data by pushing it through a neural network with a low dimension bottleneck while minimizing a reconstruction error. Using high capacity AE often leads to a large collection of minimizers, many of which represent a low dimensional manifold that fits the data well but generalizes poorly.\n\nTwo sources of bad generalization are: extrinsic, where the learned manifold possesses extraneous parts that are far from the data; and intrinsic, where the encoder and decoder introduce arbitrary distortion in the low dimensional parameterization. 
An approach taken to alleviate these issues is to add a regularizer that favors a particular solution; common regularizers promote sparsity, small derivatives, or robustness to noise.\n\nIn this paper, we advocate an isometry (i.e., local distance preserving) regularizer. Specifically, our regularizer encourages: (i) the decoder to be an isometry; and (ii) the encoder to be the decoder\u2019s pseudo-inverse, that is, the encoder extends the inverse of the decoder to the ambient space by orthogonal projection. In a nutshell, (i) and (ii) fix both intrinsic and extrinsic degrees of freedom and provide a non-linear generalization to principal component analysis (PCA). Experimenting with the isometry regularizer on dimensionality reduction tasks produces useful low-dimensional data representations.", + "title": "Isometric Autoencoders", + "authors": [ + "Amos Gropp", + "Matan Atzmon", + "Yaron Lipman" + ], + "emails": [ + "~Yaron_Lipman1", + "weizmann.ac.il", + "~Matan_Atzmon1" + ], + "rank": 1005 + }, + { + "url": "https://openreview.net/forum?id=oSrM_jG_Ng", + "decision": "Reject", + "ratings": [ + 9, + 5, + 4 + ], + "rating": "5.85", + "confidences": [ + 4, + 4, + 5 + ], + "abstract": "Attention is a powerful concept in computer vision. End-to-end networks that learn to focus selectively on regions of an image or video often perform strongly. However, other image regions, while not necessarily containing the signal of interest, may contain useful context. We present an approach that exploits the idea that statistics of noise may be shared between the regions that contain the signal of interest and those that do not. Our technique uses the inverse of an attention mask to generate a noise estimate that is then used to denoise temporal observations. We apply this to the task of camera-based physiological measurement. A convolutional attention network is used to learn which regions of a video contain the physiological signal and generate a preliminary estimate. 
A noise estimate is obtained by using the pixel intensities in the inverse regions of the learned attention mask, this in turn is used to refine the estimate of the physiological signal. We perform experiments on two large benchmark datasets and show that this approach produces state-of-the-art results, increasing the signal-to-noise ratio by up to 5.8 dB, reducing heart rate and breathing rate estimation error by as much as 30%, recovering subtle pulse waveform dynamics, and generalizing from RGB to NIR videos without retraining.", + "title": "The Benefit of Distraction: Denoising Remote Vitals Measurements Using Inverse Attention", + "authors": [ + "Ewa Magdalena Nowara", + "Daniel McDuff", + "Ashok Veeraraghavan" + ], + "emails": [ + "~Ashok_Veeraraghavan1", + "~Ewa_Magdalena_Nowara2", + "~Daniel_McDuff1" + ], + "rank": 1006 + }, + { + "url": "https://openreview.net/forum?id=8pz6GXZ3YT", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 7 + ], + "rating": "5.85", + "confidences": [ + 3, + 4, + 2, + 4 + ], + "abstract": "The lottery ticket hypothesis (LTH) states that learning on a properly pruned network (the winning ticket) has improved test accuracy over the originally unpruned network. Although LTH has been justified empirically in a broad range of deep neural network (DNN) involved applications like computer vision and natural language processing, the theoretical validation of the improved generalization of a winning ticket remains elusive. To the best of our knowledge, our work, for the first time, characterizes the performance of training a sparse neural network by analyzing the geometric structure of the objective function and the sample complexity to achieve zero generalization error. We show that the convex region near a desirable model with guaranteed generalization enlarges as the neural network model is pruned, indicating the structural importance of a winning ticket. 
Moreover, as the algorithm for training a sparse neural network is specified as (accelerated) stochastic gradient descent algorithm, we theoretically show that the number of samples required for achieving zero generalization error is proportional to the number of the non-pruned model weights in the hidden layer. With a fixed number of samples, training a pruned neural network enjoys a faster convergence rate to the desirable model than training the original unpruned one, providing a formal justification of the improved generalization of the winning ticket. Our theoretical results are acquired from learning a sparse neural network of one hidden layer, while experimental results are further provided to justify the implications in pruning multi-layer neural networks.", + "title": "Why Lottery Ticket Wins? A Theoretical Perspective of Sample Complexity on Sparse Neural Networks", + "authors": [ + "Shuai Zhang", + "Meng Wang", + "Sijia Liu", + "Pin-Yu Chen", + "Jinjun Xiong" + ], + "emails": [ + "~Meng_Wang4", + "~Shuai_Zhang6", + "~Pin-Yu_Chen1", + "~Sijia_Liu1", + "~Jinjun_Xiong1" + ], + "rank": 1007 + }, + { + "url": "https://openreview.net/forum?id=PuG6vCSbrV9", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6, + 7 + ], + "rating": "5.83", + "confidences": [ + 3, + 4, + 3, + 2 + ], + "abstract": "Normalizing Flows (NFs) are universal density estimators based on Neuronal Networks. However, this universality is limited: the density's support needs to be diffeomorphic to a Euclidean space. In this paper, we propose a novel method to overcome this limitation without sacrificing the universality. The proposed method inflates the data manifold by adding noise in the normal space, trains an NF on this inflated manifold and, finally, deflates the learned density. Our main result provides sufficient conditions on the manifold and the specific choice of noise under which the corresponding estimator is exact. 
Our method has the same computational complexity as NFs, and does not require to compute an inverse flow. We also show that, if the embedding dimension is much larger than the manifold dimension, noise in the normal space can be well approximated by some Gaussian noise. This allows using our method for approximating arbitrary densities on non-flat manifolds provided that the manifold dimension is known. ", + "title": "Density estimation on low-dimensional manifolds: an inflation-deflation approach", + "authors": [ + "Christian Horvat" + ], + "emails": [ + "~Christian_Horvat1" + ], + "rank": 1008 + }, + { + "url": "https://openreview.net/forum?id=2HLTMwxOxwe", + "decision": "Reject", + "ratings": [ + 4, + 6, + 6, + 8 + ], + "rating": "5.83", + "confidences": [ + 4, + 3, + 2, + 3 + ], + "abstract": "Machine learning models are often used in practice once they achieve good generalization results on in-distribution (ID) holdout data. To predict test sets in the wild, they should detect samples they cannot predict well. We show that current out-of-distribution (OOD) detection algorithms for neural networks produce unsatisfactory results in a variety of OOD detection scenarios, e.g.\\ when OOD data consists of unseen classes or corrupted measurements. This paper studies how such ``hard'' OOD scenarios can benefit from tuning the detection method after observing a batch of the test data. This \\emph{transductive} setting is relevant when the advantage of even a slightly delayed OOD detection outweighs the financial cost for additional tuning. We propose a novel method that uses an artificial labeling scheme for the test data and early stopping regularization to obtain ensembles of models that produce contradictory predictions only on the OOD samples in a test batch. 
We show via comprehensive experiments that our approach is indeed able to significantly outperform both inductive and transductive baselines on difficult OOD detection scenarios, such as unseen classes on CIFAR-10/CIFAR-100, severe corruptions (CIFAR-C), and strong covariate shift ImageNet vs ObjectNet.", + "title": "Learn what you can't learn: Regularized Ensembles for Transductive out-of-distribution detection", + "authors": [ + "Alexandru \u021aifrea", + "Eric Petru Stavarache", + "Fanny Yang" + ], + "emails": [ + "~Alexandru_\u021aifrea1", + "student.ethz.ch", + "~Fanny_Yang1" + ], + "rank": 1009 + }, + { + "url": "https://openreview.net/forum?id=awOrpNtsCX", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 6, + 6 + ], + "rating": "5.83", + "confidences": [ + 3, + 4, + 3, + 4, + 4 + ], + "abstract": "We present Shape-Tailored Deep Neural Networks (ST-DNN). ST-DNN extend convolutional networks, which aggregate data from fixed shape (square) neighbor-hoods to compute descriptors, to be defined on arbitrarily shaped regions. This is useful for segmentation applications, where it is desired to have descriptors that aggregate data only within regions of segmentation to avoid mixing data from different regions, otherwise, the descriptors are difficult to group to a unique region. We formulate these descriptors through partial differential equations (PDE) that naturally generalize convolution to arbitrary regions, and derive the methodology to jointly estimate the segmentation and ST-DNN descriptor. We also show that ST-DNN inherit covariance to translations and rotations from the PDE, a natural property of a segmentation method, which existing CNN based methods lack. ST-DNN are 3-4 order of magnitude smaller than typical CNN. 
We empirically show that they exceed segmentation performance compared to state-of-the-art CNN-based descriptors using 2-3 orders smaller training sets on the texture segmentation problem.", + "title": "Shape-Tailored Deep Neural Networks Using PDEs for Segmentation", + "authors": [ + "Naeemullah Khan", + "Angira Sharma", + "Philip Torr", + "Ganesh Sundaramoorthi" + ], + "emails": [ + "~Ganesh_Sundaramoorthi1", + "~Philip_Torr1", + "~Angira_Sharma1", + "~Naeemullah_Khan1" + ], + "rank": 1010 + }, + { + "url": "https://openreview.net/forum?id=M71R_ivbTQP", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 5 + ], + "rating": "5.83", + "confidences": [ + 4, + 3, + 3, + 2 + ], + "abstract": "We study how to explain the main steps/chains of inference that a deep neural net (DNN) relies on to produce predictions in a local region of data space. This problem is related to network pruning and interpretable machine learning but the highlighted differences are: (1) fine-tuning of neurons/filters is forbidden: only exact copies are allowed; (2) we target an extremely high pruning rate, e.g., \u226595%; (3) the interpretation is for the whole inference process in a local region rather than for individual neurons/filters or on a single sample. In this paper, we introduce an efficient method, \\name, to extract the local inference chains by optimizing a differentiable sparse scoring for the filters and layers to preserve the outputs on given data from a local region. Thereby, \\name~can extract an extremely small sub-network composed of filters exactly copied from the original DNN by removing the filters/layers with small scores. We then visualize the sub-network by applying existing interpretation technique to the retained layer/filter/neurons and on any sample from the local region. Its architecture reveals how the inference process stitches and integrates the information layer by layer and filter by filter. 
We provide detailed and insightful case studies together with three quantitative analyses over thousands of trials to demonstrate the quality, sparsity, fidelity and accuracy of the interpretation within the assigned local regions and over unseen data. In our empirical study, \\name~significantly enriches the interpretation and makes the inner mechanism of DNNs more transparent than before. ", + "title": "Extract Local Inference Chains of Deep Neural Nets", + "authors": [ + "Haiyan Zhao", + "Tianyi Zhou", + "Guodong Long", + "Jing Jiang", + "Chengqi Zhang" + ], + "emails": [ + "~Guodong_Long2", + "~Tianyi_Zhou1", + "~Chengqi_Zhang1", + "~Jing_Jiang6", + "~Haiyan_Zhao2" + ], + "rank": 1011 + }, + { + "url": "https://openreview.net/forum?id=GvqjmSwUxkY", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 6 + ], + "rating": "5.82", + "confidences": [ + 3, + 5, + 5, + 4 + ], + "abstract": "Every recent image-to-image translation model uses either image-level (i.e. input-output pairs) or set-level (i.e. domain labels) supervision at a minimum. However, even the set-level supervision can be a serious bottleneck for data collection in practice. In this paper, we tackle image-to-image translation in a fully unsupervised setting, i.e., neither paired images nor domain labels. To this end, we propose a truly unsupervised image-to-image translation model (TUNIT) that simultaneously learns to separate image domains and translate input images into the estimated domains. \nExperimental results show that our model achieves comparable or even better performance than the set-level supervised model trained with full labels, generalizes well on various datasets, and is robust against the choice of hyperparameters (e.g. the preset number of pseudo domains). In addition, TUNIT extends well to the semi-supervised scenario with various amount of labels provided. 
", + "title": "Rethinking the Truly Unsupervised Image-to-Image Translation", + "authors": [ + "Kyungjune Baek", + "Yunjey Choi", + "Youngjung Uh", + "Jaejun Yoo", + "Hyunjung Shim" + ], + "emails": [ + "~Yunjey_Choi3", + "~Jaejun_Yoo1", + "~Hyunjung_Shim1", + "~Youngjung_Uh2", + "~Kyungjune_Baek1" + ], + "rank": 1012 + }, + { + "url": "https://openreview.net/forum?id=fw-BHZ1KjxJ", + "decision": "Accept (Poster)", + "ratings": [ + 3, + 7, + 7, + 7 + ], + "rating": "5.82", + "confidences": [ + 5, + 4, + 3, + 5 + ], + "abstract": "Dense embedding models are commonly deployed in commercial search engines, wherein all the document vectors are pre-computed, and near-neighbor search (NNS) is performed with the query vector to find relevant documents. However, the bottleneck of indexing a large number of dense vectors and performing an NNS hurts the query time and accuracy of these models. In this paper, we argue that high-dimensional and ultra-sparse embedding is a significantly superior alternative to dense low-dimensional embedding for both query efficiency and accuracy. Extreme sparsity eliminates the need for NNS by replacing them with simple lookups, while its high dimensionality ensures that the embeddings are informative even when sparse. However, learning extremely high dimensional embeddings leads to blow up in the model size. To make the training feasible, we propose a partitioning algorithm that learns such high dimensional embeddings across multiple GPUs without any communication. This is facilitated by our novel asymmetric mixture of Sparse, Orthogonal, Learned and Random (SOLAR) Embeddings. The label vectors are random, sparse, and near-orthogonal by design, while the query vectors are learned and sparse. We theoretically prove that our way of one-sided learning is equivalent to learning both query and label embeddings. 
With these unique properties, we can successfully train 500K dimensional SOLAR embeddings for the tasks of searching through 1.6M books and multi-label classification on the three largest public datasets. We achieve superior precision and recall compared to the respective state-of-the-art baselines for each task with up to 10 times faster speed.", + "title": "SOLAR: Sparse Orthogonal Learned and Random Embeddings", + "authors": [ + "Tharun Medini", + "Beidi Chen", + "Anshumali Shrivastava" + ], + "emails": [ + "~Anshumali_Shrivastava1", + "~Tharun_Medini1", + "~Beidi_Chen1" + ], + "rank": 1013 + }, + { + "url": "https://openreview.net/forum?id=ZaYZfu8pT_N", + "decision": "Reject", + "ratings": [ + 6, + 7, + 5, + 5 + ], + "rating": "5.81", + "confidences": [ + 3, + 5, + 4, + 4 + ], + "abstract": "Although non-autoregressive models with one-iteration generation achieve remarkable inference speed-up, they still fall behind their autoregressive counterparts in prediction accuracy. The non-autoregressive models with the best accuracy currently rely on multiple decoding iterations, which largely sacrifice the inference speed of non-autoregressive models. Inspired by the way of learning word dependencies in autoregressive and iterative-decoding models, we propose Glancing Transformer (GLAT) with a glancing language model (GLM), which learns to capture the word dependency gradually. Experiments on three benchmarks demonstrate that our approach can significantly improve the accuracy of non-autoregressive models without multiple decoding iterations. 
In particular, GLAT achieves state-of-the-art results among non-iterative models and even outperforms top iterative counterparts in some specific benchmarks.", + "title": "Non-iterative Parallel Text Generation via Glancing Transformer", + "authors": [ + "Lihua Qian", + "Hao Zhou", + "Yu Bao", + "Mingxuan Wang", + "Lin Qiu", + "Weinan Zhang", + "Yong Yu", + "Lei Li" + ], + "emails": [ + "~Yu_Bao1", + "~Mingxuan_Wang1", + "~Lihua_Qian1", + "~Yong_Yu1", + "bytedance.com", + "~Weinan_Zhang1", + "~Lei_Li11", + "apex.sjtu.edu.cn" + ], + "rank": 1014 + }, + { + "url": "https://openreview.net/forum?id=gDHCPUvKRP", + "decision": "Reject", + "ratings": [ + 5, + 7, + 5, + 6 + ], + "rating": "5.81", + "confidences": [ + 5, + 5, + 3, + 3 + ], + "abstract": "A butterfly network consists of logarithmically many layers, each with a linear number of non-zero weights (pre-specified). The fast Johnson-Lindenstrauss transform (FJLT) can be represented as a butterfly network followed by a random projection to a subset of the coordinates. Moreover, a random matrix based on FJLT with high probability approximates the action of any matrix on a vector. Motivated by these facts, we propose to replace a dense linear layer in any neural network by an architecture based on the butterfly network. The proposed architecture significantly improves upon the quadratic number of weights required in a standard dense layer to nearly linear with little compromise in expressibility of the resulting operator. In a collection of wide variety of experiments, including supervised prediction on both the NLP and vision data, we show that this not only produces results that match and often outperform existing well-known architectures, but it also offers faster training and prediction in deployment. 
To understand the optimization problems posed by neural networks with a butterfly network, we study the optimization landscape of the encoder-decoder network, where the encoder is replaced by a butterfly network followed by a dense linear layer in smaller dimension. Theoretical result presented in the paper explain why the training speed and outcome are not compromised by our proposed approach. Empirically we demonstrate that the network performs as well as the encoder-decoder network.", + "title": "Sparse Linear Networks with a Fixed Butterfly Structure: Theory and Practice", + "authors": [ + "Nir Ailon", + "Omer Leibovitch", + "Vineet Sreedharan Nair" + ], + "emails": [ + "~Omer_Leibovitch1", + "~Vineet_Sreedharan_Nair1", + "~Nir_Ailon1" + ], + "rank": 1015 + }, + { + "url": "https://openreview.net/forum?id=ztMLindFLWR", + "decision": "Reject", + "ratings": [ + 6, + 6, + 7, + 5, + 5 + ], + "rating": "5.81", + "confidences": [ + 4, + 3, + 3, + 3, + 3 + ], + "abstract": "Recently, the Weisfeiler-Lehman (WL) graph isomorphism test was used to measure the expressiveness of graph neural networks (GNNs), showing that the neighborhood aggregation GNNs were at most as powerful as 1-WL test in distinguishing graph structures. There were also improvements proposed in analogy to k-WL test (k>1). However, the aggregators in these GNNs are far from injective as required by the WL test, and suffer from weak distinguishing strength, making it become expressive bottlenecks. In this paper, we improve the expressiveness by exploring powerful aggregators. We reformulate aggregation with the corresponding aggregation coefficient matrix, and then systematically analyze the requirements of the aggregation coefficient matrix for building more powerful aggregators and even injective aggregators. It can also be viewed as the strategy for preserving the rank of hidden features, and implies that basic aggregators correspond to a special case of low-rank transformations. 
We also show the necessity of applying nonlinear units ahead of aggregation, which is different from most aggregation-based GNNs. Based on our theoretical analysis, we develop two GNN layers, ExpandingConv and CombConv. Experimental results show that our models significantly boost performance, especially for large and densely connected graphs.", + "title": "Breaking the Expressive Bottlenecks of Graph Neural Networks", + "authors": [ + "Mingqi Yang", + "Yanming Shen", + "Heng Qi", + "Baocai Yin" + ], + "emails": [ + "~Baocai_Yin1", + "~Mingqi_Yang1", + "dlut.edu.cn" + ], + "rank": 1016 + }, + { + "url": "https://openreview.net/forum?id=kDnal_bbb-E", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 5 + ], + "rating": "5.81", + "confidences": [ + 5, + 4, + 4, + 3 + ], + "abstract": "To successfully negotiate a deal, it is not enough to communicate fluently: pragmatic planning of persuasive negotiation strategies is essential. While modern dialogue agents excel at generating fluent sentences, they still lack pragmatic grounding and cannot reason strategically. We present DialoGraph, a negotiation system that incorporates pragmatic strategies in a negotiation dialogue using graph neural networks. DialoGraph explicitly incorporates dependencies between sequences of strategies to enable improved and interpretable prediction of next optimal strategies, given the dialogue context. Our graph-based method outperforms prior state-of-the-art negotiation models both in the accuracy of strategy/dialogue act prediction and in the quality of downstream dialogue response generation. 
We qualitatively show further benefits of learned strategy-graphs in providing explicit associations between effective negotiation strategies over the course of the dialogue, leading to interpretable and strategic dialogues.", + "title": "DialoGraph: Incorporating Interpretable Strategy-Graph Networks into Negotiation Dialogues", + "authors": [ + "Rishabh Joshi", + "Vidhisha Balachandran", + "Shikhar Vashishth", + "Alan Black", + "Yulia Tsvetkov" + ], + "emails": [ + "~Rishabh_Joshi1", + "~Shikhar_Vashishth1", + "~Vidhisha_Balachandran1", + "~Alan_Black1", + "~Yulia_Tsvetkov1" + ], + "rank": 1017 + }, + { + "url": "https://openreview.net/forum?id=ATgKbzY1UPh", + "decision": "Reject", + "ratings": [ + 6, + 7, + 4, + 6 + ], + "rating": "5.80", + "confidences": [ + 3, + 5, + 4, + 3 + ], + "abstract": "Object recognition and viewpoint estimation lie at the heart of visual understanding. Recent works suggest that convolutional neural networks (CNNs) fail to generalize to category-viewpoint combinations not seen during training. However, it is unclear when and how such generalization may be possible. Does the number of combinations seen during training impact generalization? What architectures better enable generalization in the multi-task setting of simultaneous category and viewpoint classification? Furthermore, what are the underlying mechanisms that drive the network\u2019s generalization? In this paper, we answer these questions by analyzing state-of-the-art CNNs trained to classify both object category and 3D viewpoint, with quantitative control over the number of category-viewpoint combinations seen during training. We also investigate the emergence of two types of specialized neurons that can explain generalization to unseen combinations\u2014neurons selective to category and invariant to viewpoint, and vice versa. 
We perform experiments on MNIST extended with position or scale, the iLab dataset with vehicles at different viewpoints, and a challenging new dataset for car model recognition and viewpoint estimation that we introduce in this paper - the Biased-Cars dataset. Our results demonstrate that as the number of combinations seen during training increase, networks generalize better to unseen category-viewpoint combinations, facilitated by an increase in the selectivity and invariance of individual neurons. We find that learning category and viewpoint in separate networks compared to a shared one leads to an increase in selectivity and invariance, as separate networks are not forced to preserve information about both category and viewpoint. This enables separate networks to significantly outperform shared ones at classifying unseen category-viewpoint combinations.", + "title": "On the Capability of CNNs to Generalize to Unseen Category-Viewpoint Combinations", + "authors": [ + "Spandan Madan", + "Timothy Henry", + "Jamell Arthur Dozier", + "Helen Ho", + "Nishchal Bhandari", + "Tomotake Sasaki", + "Fredo Durand", + "Hanspeter Pfister", + "Xavier Boix" + ], + "emails": [ + "alum.mit.edu", + "~Xavier_Boix1", + "mit.edu", + "~Tomotake_Sasaki1", + "~Fredo_Durand1", + "~Hanspeter_Pfister1", + "~Spandan_Madan1", + "~Jamell_Arthur_Dozier1" + ], + "rank": 1018 + }, + { + "url": "https://openreview.net/forum?id=5tJMTHv0l8g", + "decision": "Reject", + "ratings": [ + 7, + 7, + 3 + ], + "rating": "5.80", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "The best way of keeping a secret is to pretend there is not one. In this spirit, a class of techniques called steganography aims to hide secret messages on various media leaving as little detectable trace as possible. This paper considers neural networks as novel steganographic cover media, which we call stego networks, that can be used to hide one's secret messages. 
Although there have been numerous attempts to hide information in the output of neural networks, techniques for hiding information in the neural network parameters themselves have not been actively studied in the literature. The widespread use of deep learning models in various cloud computing platforms and millions of mobile devices as of today implies the importance of safety issues regarding stego networks among deep learning researchers and practitioners. In response, this paper presents the advantages of stego networks over other types of stego media in terms of security and capacity. We provide observations that the fraction bits of some typical network parameters in a floating-point representation tend to follow uniform distributions and explain how it can help a secret sender to encrypt messages that are indistinguishable from the original content. We demonstrate that network parameters can embed a large amount of secret information. Even the most significant fraction bits can be used for hiding secrets without inducing noticeable performance degradation while making it significantly hard to remove secrets by perturbing insignificant bits. Finally, we discuss possible use cases of stego networks and methods to detect or remove secrets from stego networks.", + "title": "Stego Networks: Information Hiding on Deep Neural Networks", + "authors": [ + "Youngwoo Cho", + "Beomsoo Kim", + "Jaegul Choo" + ], + "emails": [ + "~Youngwoo_Cho1", + "~Jaegul_Choo1", + "~Beomsoo_Kim2" + ], + "rank": 1019 + }, + { + "url": "https://openreview.net/forum?id=cT0jK5VvFuS", + "decision": "Reject", + "ratings": [ + 5, + 5, + 8, + 5 + ], + "rating": "5.80", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "We explore the effects of architecture and training objective choice on amortized posterior predictive inference in probabilistic conditional generative models. 
We aim this work to be a counterpoint to a recent trend in the literature that stresses achieving good samples when the amount of conditioning data is large. We instead focus our attention on the case where the amount of conditioning data is small. We highlight specific architecture and objective choices that we find lead to qualitative and quantitative improvement to posterior inference in this low data regime. Specifically we explore the effects of choices of pooling operator and variational family on posterior quality in neural processes. Superior posterior predictive samples drawn from our novel neural process architectures are demonstrated via image completion/in-painting experiments.", + "title": "Uncertainty in Neural Processes", + "authors": [ + "Saeid Naderiparizi", + "Kenny Chiu", + "Benjamin Bloem-Reddy", + "Frank Wood" + ], + "emails": [ + "~Frank_Wood2", + "~Saeid_Naderiparizi1", + "stat.ubc.ca" + ], + "rank": 1020 + }, + { + "url": "https://openreview.net/forum?id=S6AtYQLzXOY", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 5 + ], + "rating": "5.80", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Spatio-temporal convolution often fails to learn motion dynamics in videos and thus an effective motion representation is required for video understanding in the wild. In this paper, we propose a rich and robust motion representation method based on spatio-temporal self-similarity (STSS). Given a sequence of frames, STSS represents each local region as similarities to its neighbors in space and time. By converting appearance features into relational values, it enables the learner to better recognize structural patterns in space and time. We leverage the whole volume of STSS and let our model learn to extract an effective motion representation from it. \nThe proposed method is implemented as a neural block, dubbed SELFY, that can be easily inserted into neural architectures and learned end-to-end without additional supervision. 
With a sufficient volume of the neighborhood in space and time, it effectively captures long-term interaction and fast motion in the video, leading to robust action recognition.\nOur experimental analysis demonstrates its superiority over previous methods for motion modeling as well as its complementarity to spatio-temporal features from direct convolution. On the standard action recognition benchmarks, Something-Something-V1 & V2, Diving-48, and FineGym, the proposed method achieves the state-of-the-art results. ", + "title": "Learning Self-Similarity in Space and Time as a Generalized Motion for Action Recognition", + "authors": [ + "Heeseung Kwon", + "Manjin Kim", + "Suha Kwak", + "Minsu Cho" + ], + "emails": [ + "~Heeseung_Kwon2", + "~Manjin_Kim1", + "~Suha_Kwak3", + "~Minsu_Cho1" + ], + "rank": 1021 + }, + { + "url": "https://openreview.net/forum?id=szUsQ3NcQwV", + "decision": "Reject", + "ratings": [ + 6, + 7, + 5, + 5 + ], + "rating": "5.80", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Real world multi-agent tasks often involve varying types and quantities of agents and non-agent entities; however, agents within these tasks rarely need to consider all others at all times in order to act effectively. Factored value function approaches have historically leveraged such independences to improve learning efficiency, but these approaches typically rely on domain knowledge to select fixed subsets of state features to include in each factor. We propose to utilize value function factoring with random subsets of entities in each factor as an auxiliary objective in order to disentangle value predictions from irrelevant entities. This factoring approach is instantiated through a simple attention mechanism masking procedure. We hypothesize that such an approach helps agents learn more effectively in multi-agent settings by discovering common trajectories across episodes within sub-groups of agents/entities. 
Our approach, Randomized Entity-wise Factorization for Imagined Learning (REFIL), outperforms all strong baselines by a significant margin in challenging StarCraft micromanagement tasks.", + "title": "Randomized Entity-wise Factorization for Multi-Agent Reinforcement Learning", + "authors": [ + "Shariq Iqbal", + "Christian Schroeder de Witt", + "Bei Peng", + "Wendelin Boehmer", + "Shimon Whiteson", + "Fei Sha" + ], + "emails": [ + "~Shimon_Whiteson1", + "~Fei_Sha3", + "~Wendelin_Boehmer1", + "~Shariq_Iqbal1", + "~Christian_Schroeder_de_Witt1", + "~Bei_Peng2" + ], + "rank": 1022 + }, + { + "url": "https://openreview.net/forum?id=eEeyRrKVfbL", + "decision": "Reject", + "ratings": [ + 7, + 6, + 6, + 5 + ], + "rating": "5.80", + "confidences": [ + 2, + 3, + 5, + 5 + ], + "abstract": "Pruning is an approach to alleviate overparameterization of deep neural networks (DNN) by zeroing out or pruning DNN elements with little to no efficacy at a given task. In contrast to related works that do pruning before or after training, this paper presents a novel method to perform early pruning of DNN elements (e.g., neurons or convolutional filters) during the training process while preserving performance upon convergence. To achieve this, we model the future efficacy of DNN elements in a Bayesian manner conditioned upon efficacy data collected during the training and prune DNN elements which are predicted to have low efficacy after training completion. Empirical evaluations show that the proposed Bayesian early pruning improves the computational efficiency of DNN training with small sacrifices in performance. Using our approach we are able to achieve a 48.6% faster training time for ResNet-50 on ImageNet to achieve a validation accuracy of 72.5%.", + "title": "Balancing training time vs. 
performance with Bayesian Early Pruning", + "authors": [ + "Mohit Rajpal", + "Yehong Zhang", + "Bryan Kian Hsiang Low" + ], + "emails": [ + "~Bryan_Kian_Hsiang_Low1", + "~Mohit_Rajpal1", + "~Yehong_Zhang1" + ], + "rank": 1023 + }, + { + "url": "https://openreview.net/forum?id=DM6KlL7GeB", + "decision": "Reject", + "ratings": [ + 7, + 6, + 5 + ], + "rating": "5.80", + "confidences": [ + 2, + 4, + 4 + ], + "abstract": "Network quantization, which aims to reduce the bit-lengths of the network weights and activations, has emerged as one of the key ingredients to reduce the size of neural networks for their deployments to resource-limited devices. In order to overcome the nature of transforming continuous activations and weights to discrete ones, recent study called Relaxed Quantization (RQ) [Louizos et al. 2019] successfully employ the popular Gumbel-Softmax that allows this transformation with efficient gradient-based optimization. However, RQ with this Gumbel-Softmax relaxation still suffers from bias-variance trade-off depending on the temperature parameter of Gumbel-Softmax. To resolve the issue, we propose a novel method, Semi-Relaxed Quantization (SRQ) that uses multi-class straight-through estimator to effectively reduce the bias and variance, along with a new regularization technique, DropBits that replaces dropout regularization to randomly drop the bits instead of neurons to further reduce the bias of the multi-class straight-through estimator in SRQ. As a natural extension of DropBits, we further introduce the way of learning heterogeneous quantization levels to find proper bit-length for each layer using DropBits. 
We experimentally validate our method on various benchmark datasets and network architectures, and also support the quantized lottery ticket hypothesis: learning heterogeneous quantization levels outperforms the case using the same but fixed quantization levels from scratch.", + "title": "Semi-Relaxed Quantization with DropBits: Training Low-Bit Neural Networks via Bitwise Regularization", + "authors": [ + "Jung Hyun Lee", + "Jihun Yun", + "Sung Ju Hwang", + "Eunho Yang" + ], + "emails": [ + "~Eunho_Yang1", + "~Jihun_Yun2", + "~Jung_Hyun_Lee1", + "~Sung_Ju_Hwang1" + ], + "rank": 1024 + }, + { + "url": "https://openreview.net/forum?id=VcB4QkSfyO", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 5, + 7, + 6, + 6 + ], + "rating": "5.80", + "confidences": [ + 4, + 4, + 4, + 3, + 5 + ], + "abstract": "Several methods have been proposed in recent years to provide bounds on the Lipschitz constants of deep networks, which can be used to provide robustness guarantees, generalization bounds, and characterize the smoothness of decision boundaries. However, existing bounds get substantially weaker with increasing depth of the network, which makes it unclear how to apply such bounds to recently proposed models such as the deep equilibrium (DEQ) model, which can be viewed as representing an infinitely-deep network. In this paper, we show that monotone DEQs, a recently-proposed subclass of DEQs, have Lipschitz constants that can be bounded as a simple function of the strong monotonicity parameter of the network. We derive simple-yet-tight bounds on both the input-output mapping and the weight-output mapping defined by these networks, and demonstrate that they are small relative to those for comparable standard DNNs. We show that one can use these bounds to design monotone DEQ models, even with e.g. multi-scale convolutional structure, that still have constraints on the Lipschitz constant. 
We also highlight how to use these bounds to develop PAC-Bayes generalization bounds that do not depend on any depth of the network, and which avoid the exponential depth-dependence of comparable DNN bounds.", + "title": "Estimating Lipschitz constants of monotone deep equilibrium models", + "authors": [ + "Chirag Pabbaraju", + "Ezra Winston", + "J Zico Kolter" + ], + "emails": [ + "~Chirag_Pabbaraju1", + "~Ezra_Winston1", + "~J_Zico_Kolter1" + ], + "rank": 1025 + }, + { + "url": "https://openreview.net/forum?id=M3NDrHEGyyO", + "decision": "Reject", + "ratings": [ + 7, + 5, + 6, + 5 + ], + "rating": "5.80", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "We consider the problem of reinforcement learning when provided with (1) a baseline control policy and (2) a set of constraints that the controlled system must satisfy. The baseline policy can arise from a teacher agent, demonstration data or even a heuristic while the constraints might encode safety, fairness or other application-specific requirements. Importantly, the baseline policy may be sub-optimal for the task at hand, and is not guaranteed to satisfy the specified constraints. The key challenge therefore lies in effectively leveraging the baseline policy for faster learning, while still ensuring that the constraints are minimally violated. To reconcile these potentially competing aspects, we propose an iterative policy optimization algorithm that alternates between maximizing expected return on the task, minimizing distance to the baseline policy, and projecting the policy onto the constraint-satisfying set. We analyze the convergence of our algorithm theoretically and provide a finite-sample guarantee. 
In our empirical experiments on five different control tasks, our algorithm consistently outperforms several state-of-the-art methods, achieving 10 times fewer constraint violations and 40% higher reward on average.", + "title": "Accelerating Safe Reinforcement Learning with Constraint-mismatched Policies", + "authors": [ + "Tsung-Yen Yang", + "Justinian Rosca", + "Karthik R Narasimhan", + "Peter Ramadge" + ], + "emails": [ + "~Peter_Ramadge1", + "~Tsung-Yen_Yang2", + "siemens.com", + "~Karthik_R_Narasimhan1" + ], + "rank": 1026 + }, + { + "url": "https://openreview.net/forum?id=rRFIni1CYmy", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 4 + ], + "rating": "5.80", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "Spatial memory, or the ability to remember and recall specific locations and objects, is central to autonomous agents' ability to carry out tasks in real environments. However, most existing artificial memory modules are not very adept at storing spatial information. We propose a parameter-free module, Egospheric Spatial Memory (ESM), which encodes the memory in an ego-sphere around the agent, enabling expressive 3D representations. ESM can be trained end-to-end via either imitation or reinforcement learning, and improves both training efficiency and final performance against other memory baselines on both drone and manipulator visuomotor control tasks. The explicit egocentric geometry also enables us to seamlessly combine the learned controller with other non-learned modalities, such as local obstacle avoidance. We further show applications to semantic segmentation on the ScanNet dataset, where ESM naturally combines image-level and map-level inference modalities. Through our broad set of experiments, we show that ESM provides a general computation graph for embodied spatial reasoning, and the module forms a bridge between real-time mapping systems and differentiable memory architectures. 
Implementation at: https://github.com/ivy-dl/memory.", + "title": "End-to-End Egospheric Spatial Memory", + "authors": [ + "Daniel James Lenton", + "Stephen James", + "Ronald Clark", + "Andrew Davison" + ], + "emails": [ + "~Ronald_Clark2", + "~Daniel_James_Lenton1", + "~Andrew_Davison1", + "~Stephen_James1" + ], + "rank": 1027 + }, + { + "url": "https://openreview.net/forum?id=KsN9p5qJN3", + "decision": "Reject", + "ratings": [ + 7, + 6, + 4, + 6 + ], + "rating": "5.80", + "confidences": [ + 5, + 3, + 4, + 3 + ], + "abstract": "Out-of-distribution (OOD) detection is essential to prevent anomalous inputs from causing a model to fail during deployment. Improved methods for OOD detection in multi-class classification have emerged, while OOD detection methods for multi-label classification remain underexplored and use rudimentary techniques. We propose SumEnergy, a simple and effective method, which estimates the OOD indicator scores by aggregating energy scores from multiple labels. We show that SumEnergy can be mathematically interpreted from a joint likelihood perspective. Our results show consistent improvement over previous methods that are based on the maximum-valued scores, which fail to capture joint information from multiple labels. We demonstrate the effectiveness of our method on three common multi-label classification benchmarks, including MS-COCO, PASCAL-VOC, and NUS-WIDE. We show that SumEnergy reduces the FPR95 by up to 10.05% compared to the previous best baseline, establishing state-of-the-art performance. 
", + "title": "Energy-based Out-of-distribution Detection for Multi-label Classification", + "authors": [ + "Haoran Wang", + "Weitang Liu", + "Alex Bocchieri", + "Yixuan Li" + ], + "emails": [ + "~Alex_Bocchieri1", + "~Weitang_Liu1", + "~Yixuan_Li1", + "~Haoran_Wang5" + ], + "rank": 1028 + }, + { + "url": "https://openreview.net/forum?id=fhcMwjavKEZ", + "decision": "Reject", + "ratings": [ + 8, + 6, + 7, + 3 + ], + "rating": "5.80", + "confidences": [ + 4, + 2, + 4, + 5 + ], + "abstract": "Graph neural networks (GNNs) are emerging machine learning models on graphs. One key property behind the expressiveness of existing GNNs is that the learned node representations are permutation-equivariant. Though being a desirable property for certain tasks, however, permutation-equivariance prevents GNNs from being proximity-aware, i.e., preserving the walk-based proximities between pairs of nodes, which is another critical property for graph analytical tasks. On the other hand, some variants of GNNs are proposed to preserve node proximities, but they fail to maintain permutation-equivariance. How to empower GNNs to be proximity aware while maintaining permutation-equivariance remains an open problem. In this paper, we propose Stochastic Message Passing (SMP), a general and simple GNN to maintain both proximity-awareness and permutation-equivariance properties. Specifically, we augment the existing GNNs with stochastic node representations learned to preserve node proximities. Though seemingly simple, we prove that such a mechanism can enable GNNs to preserve node proximities in theory while maintaining permutation-equivariance with certain parametrization. 
Extensive experimental results demonstrate the effectiveness and efficiency of SMP for tasks including node classification and link prediction.", + "title": "A Simple and General Graph Neural Network with Stochastic Message Passing", + "authors": [ + "Ziwei Zhang", + "Chenhao Niu", + "Peng Cui", + "Bo Zhang", + "Wei Cui", + "Wenwu Zhu" + ], + "emails": [ + "tencent.com", + "mails.tsinghua.edu.cn", + "~Peng_Cui1", + "songshuai.com", + "~Wenwu_Zhu1", + "~Ziwei_Zhang1" + ], + "rank": 1029 + }, + { + "url": "https://openreview.net/forum?id=L4v_5Qtshj7", + "decision": "Reject", + "ratings": [ + 5, + 5, + 7, + 6, + 6 + ], + "rating": "5.80", + "confidences": [ + 5, + 3, + 4, + 5, + 3 + ], + "abstract": "Humans can effectively learn to estimate how close they are to completing a desired task simply by watching others fulfill the task. To solve the task, they can then take actions towards states with higher estimated proximity to the goal. From this intuition, we propose a simple yet effective method for imitation learning that learns a goal proximity function from expert demonstrations and online agent experience, and then uses the learned proximity to provide a dense reward signal for training a policy to solve the task. By predicting task progress as the temporal distance to the goal, the goal proximity function improves generalization to unseen states over methods that aim to directly imitate expert behaviors. 
We demonstrate that our proposed method efficiently learns a set of goal-driven tasks from state-only demonstrations in navigation, robotic arm manipulation, and locomotion tasks.", + "title": "Goal-Driven Imitation Learning from Observation by Inferring Goal Proximity", + "authors": [ + "Andrew Szot", + "Youngwoon Lee", + "Shao-Hua Sun", + "Joseph J Lim" + ], + "emails": [ + "~Shao-Hua_Sun1", + "~Andrew_Szot1", + "~Joseph_J_Lim1", + "~Youngwoon_Lee1" + ], + "rank": 1030 + }, + { + "url": "https://openreview.net/forum?id=F8lXvXpZdrL", + "decision": "Reject", + "ratings": [ + 5, + 5, + 7, + 7 + ], + "rating": "5.80", + "confidences": [ + 3, + 3, + 2, + 2 + ], + "abstract": "Training neural networks with binary weights and activations is a challenging problem due to the lack of gradients and difficulty of optimization over discrete weights.\nMany successful experimental results have been achieved with empirical straight-through (ST) approaches, proposing a variety of ad-hoc rules for propagating gradients through non-differentiable activations and updating discrete weights. At the same time, ST methods can be truly derived as estimators in the stochastic binary network (SBN) model with Bernoulli weights. We advance these derivations to a more complete and systematic study. We analyze properties, estimation accuracy, obtain different forms of correct ST estimators for activations and weights, explain existing empirical approaches and their shortcomings, explain how latent weights arise from the mirror descent method when optimizing over probabilities. 
This allows to reintroduce, once empirical, ST methods as sound approximations, apply them with clarity and develop further improvements.", + "title": "Reintroducing Straight-Through Estimators as Principled Methods for Stochastic Binary Networks", + "authors": [ + "Alexander Shekhovtsov", + "Viktor Yanush" + ], + "emails": [ + "~Alexander_Shekhovtsov1", + "gmail.com" + ], + "rank": 1031 + }, + { + "url": "https://openreview.net/forum?id=I3zV6igAT9", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 6 + ], + "rating": "5.79", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Recent works have shown that most deep learning models are often poorly calibrated, i.e., they may produce overconfident\npredictions that are wrong, implying that their uncertainty estimates are unreliable. While a number of approaches have been proposed recently to calibrate classification models, relatively little work exists on calibrating regression models. Isotonic Regression has recently been advocated for regression calibration. We provide a detailed formal analysis of the \\emph{side-effects} of Isotonic Regression when used for regression calibration. To address this, we recast quantile calibration as entropy estimation, and leverage this idea to construct a novel quantile regularizer, which can be used in any optimization based probabilisitc regression models. Unlike most of the existing approaches for calibrating regression models, which are based on \\emph{post-hoc} processing of the model's output, and require an additional dataset, our method is trainable in an end-to-end fashion, without requiring an additional dataset. 
We provide empirical results demonstrating that our approach improves calibration for regression models trained on diverse architectures that provide uncertainty estimates, such as Dropout VI, Deep Ensembles", + "title": "Quantile Regularization : Towards Implicit Calibration of Regression Models", + "authors": [ + "Saiteja Utpala", + "Piyush Rai" + ], + "emails": [ + "~Piyush_Rai1", + "~Saiteja_Utpala1" + ], + "rank": 1032 + }, + { + "url": "https://openreview.net/forum?id=EsA9Nr9JHvy", + "decision": "Reject", + "ratings": [ + 7, + 5, + 6, + 5 + ], + "rating": "5.79", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "In recent years, various notions of capacity and complexity have been proposed for characterizing the generalization properties of stochastic gradient descent (SGD) in deep learning. Some of the popular notions that correlate well with the performance on unseen data are (i) the 'flatness' of the local minimum found by SGD, which is related to the eigenvalues of the Hessian, (ii) the ratio of the stepsize \u03b7 to the batch size b, which essentially controls the magnitude of the stochastic gradient noise, and (iii) the 'tail-index', which measures the heaviness of the tails of the network weights at convergence. In this paper, we argue that these three seemingly unrelated perspectives for generalization are deeply linked to each other. We claim that depending on the structure of the Hessian of the loss at the minimum, and the choices of the algorithm parameters \u03b7 and b, the SGD iterates will converge to a \\emph{heavy-tailed} stationary distribution. We rigorously prove this claim in the setting of quadratic optimization: we show that even in a simple linear regression problem with independent and identically distributed Gaussian data, the iterates can be heavy-tailed with infinite variance. We further characterize the behavior of the tails with respect to algorithm parameters, the dimension, and the curvature. 
We then translate our results into insights about the behavior of SGD in deep learning. We finally support our theory with experiments conducted on both synthetic data and fully connected neural networks.", + "title": "The Heavy-Tail Phenomenon in SGD", + "authors": [ + "Mert Gurbuzbalaban", + "Umut Simsekli", + "Lingjiong Zhu" + ], + "emails": [ + "~Umut_Simsekli1", + "~Lingjiong_Zhu1", + "~Mert_Gurbuzbalaban1" + ], + "rank": 1033 + }, + { + "url": "https://openreview.net/forum?id=g6OrH2oT5so", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 6 + ], + "rating": "5.79", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "When expert supervision is available, practitioners often use imitation learning with varying degrees of success. We show that when an expert has access to privileged information that is unavailable to the student, this information is marginalized in the student policy during imitation learning resulting in an ''imitation gap'' and, potentially, poor results. Prior work bridges this gap via a progression from imitation learning to reinforcement learning. While often successful, gradual progression fails for tasks that require frequent switches between exploration and memorization skills. To better address these tasks and alleviate the imitation gap we propose 'Adaptive Insubordination' (ADVISOR), which dynamically weights imitation and reward-based reinforcement learning losses during training, enabling switching between imitation and exploration. 
On a suite of challenging didactic and MiniGrid tasks, we show that ADVISOR outperforms pure imitation, pure reinforcement learning, as well as their sequential and parallel combinations.", + "title": "Bridging the Imitation Gap by Adaptive Insubordination", + "authors": [ + "Luca Weihs", + "Unnat Jain", + "Jordi Salvador", + "Svetlana Lazebnik", + "Aniruddha Kembhavi", + "Alex Schwing" + ], + "emails": [ + "~Alex_Schwing1", + "~Unnat_Jain1", + "~Jordi_Salvador3", + "~Luca_Weihs1", + "~Aniruddha_Kembhavi1", + "~Svetlana_Lazebnik1" + ], + "rank": 1034 + }, + { + "url": "https://openreview.net/forum?id=W3Wf_wKmqm9", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 6, + 6 + ], + "rating": "5.79", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Multi-goal reaching is an important problem in reinforcement learning needed to achieve algorithmic generalization. Despite recent advances in this field, current algorithms suffer from three major challenges: high sample complexity, learning only a single way of reaching the goals, and difficulties in solving complex motion planning tasks. In order to address these limitations, we introduce the concept of cumulative accessibility functions, which measure the reachability of a goal from a given state within a specified horizon. We show that these functions obey a recurrence relation, which enables learning from offline interactions. We also prove that optimal cumulative accessibility functions are monotonic in the planning horizon. Additionally, our method can trade off speed and reliability in goal-reaching by suggesting multiple paths to a single goal depending on the provided horizon. We evaluate our approach on a set of multi-goal discrete and continuous control tasks. We show that our method outperforms state-of-the-art goal-reaching algorithms in success rate, sample complexity, and path optimality. 
Our code is available at https://github.com/layer6ai-labs/CAE, and additional visualizations can be found at https://sites.google.com/view/learning-cae/.", + "title": "C-Learning: Horizon-Aware Cumulative Accessibility Estimation", + "authors": [ + "Panteha Naderian", + "Gabriel Loaiza-Ganem", + "Harry J. Braviner", + "Anthony L. Caterini", + "Jesse C. Cresswell", + "Tong Li", + "Animesh Garg" + ], + "emails": [ + "~Animesh_Garg1", + "~Anthony_L._Caterini1", + "~Gabriel_Loaiza-Ganem1", + "layer6.ai" + ], + "rank": 1035 + }, + { + "url": "https://openreview.net/forum?id=4emQEegFhSy", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 7 + ], + "rating": "5.79", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "In this paper, we consider intrinsic reward generation for sparse-reward reinforcement learning based on model prediction errors. In typical model-prediction-error-based intrinsic reward generation, an agent has a learning model for the underlying environment. Then intrinsic reward is designed as the error between the model prediction and the actual outcome of the environment, based on the fact that for less-visited or non-visited states, the learned model yields larger prediction errors, promoting exploration helpful for reinforcement learning. This paper generalizes this model-prediction-error-based intrinsic reward generation method to multiple prediction models. We propose a new adaptive fusion method relevant to the multiple-model case, which learns optimal prediction-error fusion across the learning phase to enhance the overall learning performance. 
Numerical results show that for representative locomotion tasks, the proposed intrinsic reward generation method outperforms most of the previous methods, and the gain is significant in some tasks.", + "title": "Adaptive Multi-model Fusion Learning for Sparse-Reward Reinforcement Learning", + "authors": [ + "Giseung Park", + "Whiyoung Jung", + "Sungho Choi", + "Youngchul Sung" + ], + "emails": [ + "~Whiyoung_Jung1", + "~Giseung_Park1", + "~Youngchul_Sung1", + "~Sungho_Choi1" + ], + "rank": 1036 + }, + { + "url": "https://openreview.net/forum?id=Ua5yGJhfgAg", + "decision": "Reject", + "ratings": [ + 7, + 5, + 6, + 5 + ], + "rating": "5.79", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "In this paper, we tackle the problem of learning control policies for tasks when provided with constraints in natural language. In contrast to instruction following, language here is used not to specify goals, but rather to describe situations that an agent must avoid during its exploration of the environment. Specifying constraints in natural language also differs from the predominant paradigm in safe reinforcement learning, where safety criteria are enforced by hand-defined cost functions. While natural language allows for easy and flexible specification of safety constraints and budget limitations, its ambiguous nature presents a challenge when mapping these specifications into representations that can be used by techniques for safe reinforcement learning. To address this, we develop a model that contains two components: (1) a constraint interpreter to encode natural language constraints into vector representations capturing spatial and temporal information on forbidden states, and (2) a policy network that uses these representations to output a policy with minimal constraint violations. Our model is end-to-end differentiable and we train it using a recently proposed algorithm for constrained policy optimization. 
To empirically demonstrate the effectiveness of our approach, we create a new benchmark task for autonomous navigation with crowd-sourced free-form text specifying three different types of constraints. Our method outperforms several baselines by achieving 6-7 times higher returns and 76% fewer constraint violations on average. Dataset and code to reproduce our experiments are available at https://sites.google.com/view/polco-hazard-world/.", + "title": "Safe Reinforcement Learning with Natural Language Constraints", + "authors": [ + "Tsung-Yen Yang", + "Michael Hu", + "Yinlam Chow", + "Peter Ramadge", + "Karthik R Narasimhan" + ], + "emails": [ + "~Yinlam_Chow1", + "~Michael_Hu1", + "~Peter_Ramadge1", + "~Karthik_R_Narasimhan1", + "~Tsung-Yen_Yang2" + ], + "rank": 1037 + }, + { + "url": "https://openreview.net/forum?id=QxQkG-gIKJM", + "decision": "Reject", + "ratings": [ + 4, + 6, + 7, + 6, + 6 + ], + "rating": "5.79", + "confidences": [ + 4, + 4, + 4, + 4, + 3 + ], + "abstract": "Optimism in the face of uncertainty is a principled approach for provably efficient exploration for reinforcement learning in tabular and linear settings. However, such an approach is challenging in developing practical exploration algorithms for Deep Reinforcement Learning (DRL). To address this problem, we propose an Optimistic Exploration algorithm with Backward Bootstrapped Bonus (OEB3) for DRL by following these two principles. OEB3 is built on bootstrapped deep Q-learning, a non-parametric posterior sampling method for temporally-extended exploration. Based on such a temporally-extended exploration, we construct an UCB-bonus indicating the uncertainty of Q-functions. The UCB-bonus is further utilized to estimate an optimistic Q-value, which encourages the agent to explore the scarcely visited states and actions to reduce uncertainty. 
In the estimation of Q-function, we adopt an episodic backward update strategy to propagate the future uncertainty to the estimated Q-function consistently. Extensive evaluations show that OEB3 outperforms several state-of-the-art exploration approaches in Mnist maze and 49 Atari games.", + "title": "Optimistic Exploration with Backward Bootstrapped Bonus for Deep Reinforcement Learning", + "authors": [ + "Chenjia Bai", + "Lingxiao Wang", + "Peng Liu", + "Zhaoran Wang", + "Jianye HAO", + "Yingnan Zhao" + ], + "emails": [ + "~Chenjia_Bai2", + "~Yingnan_Zhao1", + "~Peng_Liu5", + "~Jianye_HAO1", + "~Zhaoran_Wang1", + "~Lingxiao_Wang6" + ], + "rank": 1038 + }, + { + "url": "https://openreview.net/forum?id=chPj_I5KMHG", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 6, + 6, + 7 + ], + "rating": "5.79", + "confidences": [ + 3, + 5, + 3, + 3 + ], + "abstract": "We are interested in the autonomous acquisition of repertoires of skills. Language-conditioned reinforcement learning (LC-RL) approaches are great tools in this quest, as they allow to express abstract goals as sets of constraints on the states. However, most LC-RL agents are not autonomous and cannot learn without external instructions and feedback. Besides, their direct language condition cannot account for the goal-directed behavior of pre-verbal infants and strongly limits the expression of behavioral diversity for a given language input. To resolve these issues, we propose a new conceptual approach to language-conditioned RL: the Language-Goal-Behavior architecture (LGB). LGB decouples skill learning and language grounding via an intermediate semantic representation of the world. To showcase the properties of LGB, we present a specific implementation called DECSTR. DECSTR is an intrinsically motivated learning agent endowed with an innate semantic representation describing spatial relations between physical objects. 
In a first stage G -> B, it freely explores its environment and targets self-generated semantic configurations. In a second stage (L -> G), it trains a language-conditioned goal generator to generate semantic goals that match the constraints expressed in language-based inputs. We showcase the additional properties of LGB w.r.t. both an end-to-end LC-RL approach and a similar approach leveraging non-semantic, continuous intermediate representations. Intermediate semantic representations help satisfy language commands in a diversity of ways, enable strategy switching after a failure and facilitate language grounding.", + "title": "Grounding Language to Autonomously-Acquired Skills via Goal Generation", + "authors": [ + "Ahmed Akakzia", + "C\u00e9dric Colas", + "Pierre-Yves Oudeyer", + "Mohamed CHETOUANI", + "Olivier Sigaud" + ], + "emails": [ + "~Mohamed_CHETOUANI2", + "~Pierre-Yves_Oudeyer1", + "~Ahmed_Akakzia1", + "~Olivier_Sigaud1", + "~C\u00e9dric_Colas1" + ], + "rank": 1039 + }, + { + "url": "https://openreview.net/forum?id=fylclEqgvgd", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 7, + 5 + ], + "rating": "5.78", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "Unsupervised contact prediction is central to uncovering physical, structural, and functional constraints for protein structure determination and design. For decades, the predominant approach has been to infer evolutionary constraints from a set of related sequences. In the past year, protein language models have emerged as a potential alternative, but performance has fallen short of state-of-the-art approaches in bioinformatics. In this paper we demonstrate that Transformer attention maps learn contacts from the unsupervised language modeling objective. 
We find the highest capacity models that have been trained to date already outperform a state-of-the-art unsupervised contact prediction pipeline, suggesting these pipelines can be replaced with a single forward pass of an end-to-end model.", + "title": "Transformer protein language models are unsupervised structure learners", + "authors": [ + "Roshan Rao", + "Joshua Meier", + "Tom Sercu", + "Sergey Ovchinnikov", + "Alexander Rives" + ], + "emails": [ + "~Tom_Sercu1", + "~Roshan_Rao1", + "~Alexander_Rives1", + "~Joshua_Meier1", + "~Sergey_Ovchinnikov1" + ], + "rank": 1040 + }, + { + "url": "https://openreview.net/forum?id=YjNv-hzM8BE", + "decision": "Reject", + "ratings": [ + 4, + 9, + 4, + 7, + 5 + ], + "rating": "5.78", + "confidences": [ + 5, + 5, + 5, + 4, + 4 + ], + "abstract": "Recent studies about learning multilingual representations have achieved significant performance gains across a wide range of downstream cross-lingual tasks. They train either an encoder-only Transformer mainly for understanding tasks, or an encoder-decoder Transformer specifically for generation tasks, ignoring the correlation between the two tasks and frameworks. In contrast, this paper presents a variable encoder-decoder (VECO) pre-training approach to unify the two mainstreams in both model architectures and pre-training tasks. VECO splits the standard Transformer block into several sub-modules trained with both inner-sequence and cross-sequence masked language modeling, and correspondingly reorganizes certain sub-modules for understanding and generation tasks during inference. Such a workflow not only ensures to train the most streamlined parameters necessary for two kinds of tasks, but also enables them to boost each other via sharing common sub-modules. As a result, VECO delivers new state-of-the-art results on various cross-lingual understanding tasks of the XTREME benchmark covering text classification, sequence labeling, question answering, and sentence retrieval. 
For generation tasks, VECO also outperforms all existing cross-lingual models and state-of-the-art Transformer variants on WMT14 English-to-German and English-to-French translation datasets, with gains of up to 1~2 BLEU.", + "title": "VECO: Variable Encoder-decoder Pre-training for Cross-lingual Understanding and Generation", + "authors": [ + "Fuli Luo", + "Wei Wang", + "Jiahao Liu", + "Yijia Liu", + "Bin Bi", + "Songfang Huang", + "Fei Huang", + "Luo Si" + ], + "emails": [ + "~Songfang_Huang1", + "~Jiahao_Liu2", + "~Wei_Wang41", + "~Fuli_Luo1", + "~Luo_Si3", + "~Yijia_Liu1", + "~Bin_Bi1", + "~Fei_Huang2" + ], + "rank": 1041 + }, + { + "url": "https://openreview.net/forum?id=GIeGTl8EYx", + "decision": "Reject", + "ratings": [ + 6, + 7, + 5, + 5 + ], + "rating": "5.77", + "confidences": [ + 4, + 3, + 4, + 2 + ], + "abstract": "While Graph Neural Networks (GNNs) are powerful models for learning representations on graphs, most state-of-the-art models do not have significant accuracy gain beyond two to three layers. Deep GNNs fundamentally need to address: 1). expressivity challenge due to oversmoothing, and 2). computation challenge due to neighborhood explosion. We propose a simple \"deep GNN, shallow sampler\" design principle to improve both the GNN accuracy and efficiency --- to generate representation of a target node, we use a deep GNN to pass messages only within a shallow, localized subgraph. A properly sampled subgraph may exclude irrelevant or even noisy nodes, and still preserve the critical neighbor features and graph structures. The deep GNN then smooths the informative local signals to enhance feature learning, rather than oversmoothing the global graph signals into just \"white noise\". We theoretically justify why the combination of deep GNNs with shallow samplers yields the best learning performance. We then propose various sampling algorithms and neural architecture extensions to achieve good empirical results. 
Experiments on five large graphs show that our models achieve significantly higher accuracy and efficiency, compared with state-of-the-art. ", + "title": "Deep Graph Neural Networks with Shallow Subgraph Samplers", + "authors": [ + "Hanqing Zeng", + "Muhan Zhang", + "Yinglong Xia", + "Ajitesh Srivastava", + "Rajgopal Kannan", + "Viktor Prasanna", + "Long Jin", + "Andrey Malevich", + "Ren Chen" + ], + "emails": [ + "~Viktor_Prasanna1", + "~Hanqing_Zeng1", + "~Muhan_Zhang1", + "usc.edu", + "fb.com", + "~Ajitesh_Srivastava1" + ], + "rank": 1042 + }, + { + "url": "https://openreview.net/forum?id=aa0705s2Qc", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6, + 6 + ], + "rating": "5.77", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Self-supervised learning and data augmentation have significantly reduced the performance gap between state and image-based reinforcement learning agents in continuous control tasks. However, it is still unclear whether current techniques can face the variety of visual conditions required by real-world environments. We propose a challenging benchmark that tests agents' visual generalization by adding graphical variety to existing continuous control domains. Our empirical analysis shows that current methods struggle to generalize across a diverse set of visual changes, and we examine the specific factors of variation that make these tasks difficult. 
We find that data augmentation techniques outperform self-supervised learning approaches and that more significant image transformations provide better visual generalization.", + "title": "Measuring Visual Generalization in Continuous Control from Pixels", + "authors": [ + "Jake Grigsby", + "Yanjun Jane Qi" + ], + "emails": [ + "~Jake_Grigsby1", + "~Yanjun_Jane_Qi1" + ], + "rank": 1043 + }, + { + "url": "https://openreview.net/forum?id=P42rXLGZQ07", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 6 + ], + "rating": "5.77", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "Discrete latent variables are considered important to model the generation process of real world data, which has motivated research on Variational Autoencoders (VAEs) with discrete latents. However, standard VAE training is not possible in this case, which has motivated different strategies to manipulate discrete distributions in order to train discrete VAEs similarly to conventional ones.\nHere we ask if it is also possible to keep the discrete nature of the latents fully intact by applying a direct discrete optimization for the encoding model. The studied approach is consequently strongly diverting from standard VAE training by altogether sidestepping absolute standard VAE mechanisms such as sampling approximation, reparameterization trick and amortization. \n\nDiscrete optimization is realized in a variational setting using truncated posteriors in conjunction with evolutionary algorithms (using a recently suggested approach). For VAEs with binary latents, we first show how such a discrete variational method (A)~ties into gradient ascent for network weights and (B)~uses the decoder network to select latent states for training. 
\n \nMore conventional amortized training is, as may be expected, more efficient than direct discrete optimization, and applicable to large neural networks.\nHowever, we here find direct optimization to be efficiently scalable to hundreds of latent variables using smaller networks.\nMore importantly, we find the effectiveness of direct optimization to be highly competitive in 'zero-shot' learning (where high effectiveness for small networks is required).\nIn contrast to large supervised neural networks, the here investigated VAEs can denoise a single image without previous training on clean data and/or training on large image datasets. \n\nMore generally, the studied approach shows that training of VAEs is indeed possible without sampling-based approximation and reparameterization, which may be interesting for the analysis of VAE training in general. In the regime of few data, direct optimization, furthermore, makes VAEs competitive for denoising where they have previously been outperformed by non-generative approaches.", + "title": "Direct Evolutionary Optimization of Variational Autoencoders with Binary Latents", + "authors": [ + "Enrico Guiraud", + "Jakob Drefs", + "Jorg Lucke" + ], + "emails": [ + "~Jakob_Drefs1", + "~Jorg_Lucke1", + "~Enrico_Guiraud1" + ], + "rank": 1044 + }, + { + "url": "https://openreview.net/forum?id=_Tf6jEzbH9", + "decision": "Reject", + "ratings": [ + 7, + 5, + 5, + 6 + ], + "rating": "5.77", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "We propose a novel setting for learning, where the input domain is the image of a map defined on the product of two sets, one of which completely determines the labels. Given the ability to sample from each set independently, we present an algorithm that learns a classifier over the input domain more efficiently than sampling from the input domain directly. 
We apply this setting to visual classification tasks, where our approach enables us to train classifiers on datasets that consist entirely of a single example of each class. On several standard benchmarks for real-world image classification, our approach achieves performance competitive with state-of-the-art results from the few-shot learning and domain transfer literature, while using significantly less data.", + "title": "Context-Agnostic Learning Using Synthetic Data", + "authors": [ + "Charles Jin", + "Martin Rinard" + ], + "emails": [ + "~Charles_Jin1", + "~Martin_Rinard1" + ], + "rank": 1045 + }, + { + "url": "https://openreview.net/forum?id=CGFN_nV1ql", + "decision": "Reject", + "ratings": [ + 6, + 5, + 8, + 4 + ], + "rating": "5.77", + "confidences": [ + 2, + 3, + 4, + 4 + ], + "abstract": "This paper presents Non-Attentive Tacotron based on the Tacotron 2 text-to-speech model, replacing the attention mechanism with an explicit duration predictor. This improves robustness significantly as measured by unaligned duration ratio and word deletion rate, two metrics introduced in this paper for large-scale robustness evaluation using a pre-trained speech recognition model. With the use of Gaussian upsampling, Non-Attentive Tacotron achieves a 5-scale mean opinion score for naturalness of 4.41, slightly outperforming Tacotron 2. The duration predictor enables both utterance-wide and per-phoneme control of duration at inference time. 
When accurate target durations are scarce or unavailable in the training data, we propose a method using a fine-grained variational auto-encoder to train the duration predictor in a semi-supervised or unsupervised manner, with results almost as good as supervised training.", + "title": "Non-Attentive Tacotron: Robust and controllable neural TTS synthesis including unsupervised duration modeling", + "authors": [ + "Jonathan Shen", + "Ye Jia", + "Mike Chrzanowski", + "Yu Zhang", + "Isaac Elias", + "Heiga Zen", + "Yonghui Wu" + ], + "emails": [ + "~Yonghui_Wu1", + "~Heiga_Zen1", + "~Mike_Chrzanowski2", + "~Ye_Jia1", + "google.com", + "~Jonathan_Shen1", + "~Yu_Zhang2" + ], + "rank": 1046 + }, + { + "url": "https://openreview.net/forum?id=ctgsGEmWjDY", + "decision": "Reject", + "ratings": [ + 5, + 7, + 6, + 5 + ], + "rating": "5.77", + "confidences": [ + 4, + 3, + 4, + 2 + ], + "abstract": "3D point clouds play pivotal roles in various safety-critical fields, such as autonomous driving, which desires the corresponding deep neural networks to be robust to adversarial perturbations. Though a few defenses against adversarial point cloud classification have been proposed, it remains unknown whether they can provide real robustness. To this end, we perform the first security analysis of state-of-the-art defenses and design adaptive attacks on them. Our 100% adaptive attack success rates demonstrate that current defense designs are still vulnerable. Since adversarial training (AT) is believed to be the most effective defense, we present the first in-depth study showing how AT behaves in point cloud classification and identify that the required symmetric function (pooling operation) is paramount to the model's robustness under AT. Through our systematic analysis, we find that the default used fixed pooling operations (e.g., MAX pooling) generally weaken AT's performance in point cloud classification. 
Still, sorting-based parametric pooling operations can significantly improve the models' robustness. Based on the above insights, we further propose DeepSym, a deep symmetric pooling operation, to architecturally advance the adversarial robustness under AT to 47.01% without sacrificing nominal accuracy, outperforming the original design and a strong baseline by 28.5% (\u223c2.6\u00d7) and 6.5%, respectively, in PointNet. ", + "title": "On The Adversarial Robustness of 3D Point Cloud Classification", + "authors": [ + "Jiachen Sun", + "Karl Koenig", + "Yulong Cao", + "Qi Alfred Chen", + "Zhuoqing Mao" + ], + "emails": [ + "~Zhuoqing_Mao1", + "~Jiachen_Sun1", + "~Qi_Alfred_Chen1", + "umich.edu" + ], + "rank": 1047 + }, + { + "url": "https://openreview.net/forum?id=T3RyQtRHebj", + "decision": "Reject", + "ratings": [ + 6, + 5, + 4, + 7 + ], + "rating": "5.77", + "confidences": [ + 4, + 3, + 2, + 4 + ], + "abstract": "In contrast to traditional weight optimization in a continuous space, we demonstrate the existence of effective random networks whose weights are never updated. By selecting a weight among a fixed set of random values for each individual connection, our method uncovers combinations of random weights that match the performance of trained networks of the same capacity. We refer to our networks as ``slot machines'' where each reel (connection) contains a fixed set of symbols (random values). Our backpropagation algorithm ``spins'' the reels to seek ``winning'' combinations, i.e., selections of random weight values that minimize the given loss. Quite surprisingly, we find that allocating just a few random values to each connection (e.g., 8 values per connection) yields highly competitive combinations despite being dramatically more constrained compared to traditionally learned weights. Moreover, finetuning these combinations often improves performance over the trained baselines. 
A randomly initialized VGG-19 with 8 values per connection contains a combination that achieves 90% test accuracy on CIFAR-10. Our method also achieves an impressive performance of 98.1% on MNIST for neural networks containing only random weights. ", + "title": "Slot Machines: Discovering Winning Combinations of Random Weights in Neural Networks", + "authors": [ + "Maxwell Mbabilla Aladago", + "Lorenzo Torresani" + ], + "emails": [ + "~Lorenzo_Torresani1", + "~Maxwell_Mbabilla_Aladago1" + ], + "rank": 1048 + }, + { + "url": "https://openreview.net/forum?id=F2v4aqEL6ze", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 4, + 6, + 7 + ], + "rating": "5.76", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "We propose a general, yet simple patch that can be applied to existing regularization-based continual learning methods called classifier-projection regularization (CPR). Inspired by both recent results on neural networks with wide local minima and information theory, CPR adds an additional regularization term that maximizes the entropy of a classifier's output probability. We demonstrate that this additional term can be interpreted as a projection of the conditional probability given by a classifier's output to the uniform distribution. By applying the Pythagorean theorem for KL divergence, we then prove that this projection may (in theory) improve the performance of continual learning methods. In our extensive experimental results, we apply CPR to several state-of-the-art regularization-based continual learning methods and benchmark performance on popular image recognition datasets. Our results demonstrate that CPR indeed promotes a wide local minima and significantly improves both accuracy and plasticity while simultaneously mitigating the catastrophic forgetting of baseline continual learning methods. 
The codes and scripts for this work are available at https://github.com/csm9493/CPR_CL.", + "title": "CPR: Classifier-Projection Regularization for Continual Learning", + "authors": [ + "Sungmin Cha", + "Hsiang Hsu", + "Taebaek Hwang", + "Flavio Calmon", + "Taesup Moon" + ], + "emails": [ + "~Taesup_Moon1", + "~Flavio_Calmon1", + "~Hsiang_Hsu1", + "gmail.com", + "~Sungmin_Cha1" + ], + "rank": 1049 + }, + { + "url": "https://openreview.net/forum?id=-Qaj4_O3cO", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 6 + ], + "rating": "5.75", + "confidences": [ + 4, + 5, + 4, + 3 + ], + "abstract": "Spiking Neural Networks (SNNs) offer a promising alternative to traditional deep learning frameworks, since they provide higher \ncomputational efficiency due to event-driven information processing. SNNs distribute the analog values of pixel intensities into binary spikes over time. However, the most widely used input coding schemes, such as Poisson based rate-coding, do not leverage the additional temporal learning capability of SNNs effectively. Moreover, these SNNs suffer from high inference latency which is a major bottleneck to their deployment. To overcome this, we propose a scalable time-based encoding scheme that utilizes the Discrete Cosine Transform (DCT) to reduce the number of timesteps required for inference. DCT decomposes an image into a weighted sum of sinusoidal basis images. At each time step, a single frequency base, taken in order and modulated\nby its corresponding DCT coefficient, is input to an accumulator that generates spikes upon crossing a threshold. We use the proposed scheme to learn DCT-SNN, a low-latency deep SNN with leaky-integrate-and-fire neurons, trained using surrogate gradient descent based backpropagation. We achieve top-1 accuracy of 89.94%, 68.3% and 52.43% on CIFAR-10, CIFAR-100 and TinyImageNet, respectively using VGG architectures. 
Notably, DCT-SNN performs inference with 2-14X reduced latency compared to other state-of-the-art SNNs, while achieving comparable accuracy to their standard deep learning counterparts. The dimension of the transform allows us to control the number of timesteps required for inference. Additionally, we can trade-off accuracy with latency in a principled manner by dropping the highest frequency components during inference.", + "title": "DCT-SNN: Using DCT to Distribute Spatial Information over Time for Learning Low-Latency Spiking Neural Networks", + "authors": [ + "Isha Garg", + "Sayeed Shafayet Chowdhury", + "Kaushik Roy" + ], + "emails": [ + "~Isha_Garg1", + "~Kaushik_Roy1", + "~Sayeed_Shafayet_Chowdhury3" + ], + "rank": 1050 + }, + { + "url": "https://openreview.net/forum?id=d-XzF81Wg1", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 4 + ], + "rating": "5.75", + "confidences": [ + 5, + 3, + 4 + ], + "abstract": "Batch normalization (BatchNorm) has become a standard technique in deep learning. Its popularity is in no small part due to its often positive effect on generalization. Despite this success, the regularization effect of the technique is still poorly understood. This study aims to decompose BatchNorm into separate mechanisms that are much simpler. We identify three effects of BatchNorm and assess their impact directly with ablations and interventions. Our experiments show that preventing explosive growth at the final layer at initialization and during training can recover a large part of BatchNorm's generalization boost. This regularization mechanism can lift accuracy by 2.9% for Resnet-50 on Imagenet without BatchNorm. We show it is linked to other methods like Dropout and recent initializations like Fixup. Surprisingly, this simple mechanism matches the improvement of 0.9% of the more complex Dropout regularization for the state-of-the-art Efficientnet-B8 model on Imagenet. 
This demonstrates the underrated effectiveness of simple regularizations and sheds light on directions to further improve generalization for deep nets.", + "title": "Deconstructing the Regularization of BatchNorm", + "authors": [ + "Yann Dauphin", + "Ekin Dogus Cubuk" + ], + "emails": [ + "~Ekin_Dogus_Cubuk1", + "~Yann_Dauphin1" + ], + "rank": 1051 + }, + { + "url": "https://openreview.net/forum?id=ku4sJKvnbwV", + "decision": "Reject", + "ratings": [ + 4, + 6, + 6, + 7 + ], + "rating": "5.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "The ability to construct and execute long-term plans enables intelligent agents to solve complex multi-step tasks and prevents myopic behavior only seeking the short-term reward. Recent work has achieved significant progress on building agents that can predict and plan from raw visual observations. However, existing visual planning methods still require a densely shaped reward that provides the algorithm with a short-term signal that is always easy to optimize. These algorithms fail when the shaped reward is not available as they use simplistic planning methods such as sampling-based random shooting and are unable to plan for a distant goal. Instead, to achieve long-horizon visual control, we propose to use collocation-based planning, a powerful optimal control technique that plans forward a sequence of states while constraining the transitions to be physical. We propose a planning algorithm that adapts collocation to visual planning by leveraging probabilistic latent variable models. A model-based reinforcement learning agent equipped with our planning algorithm significantly outperforms prior model-based agents on challenging visual control tasks with sparse rewards and long-term goals. 
", + "title": "Model-Based Reinforcement Learning via Latent-Space Collocation", + "authors": [ + "Oleh Rybkin", + "Chuning Zhu", + "Anusha Nagabandi", + "Kostas Daniilidis", + "Igor Mordatch", + "Sergey Levine" + ], + "emails": [ + "~Oleh_Rybkin1", + "~Anusha_Nagabandi1", + "seas.upenn.edu", + "~Igor_Mordatch4", + "~Sergey_Levine1", + "~Kostas_Daniilidis1" + ], + "rank": 1052 + }, + { + "url": "https://openreview.net/forum?id=EGVxmJKLC2L", + "decision": "Reject", + "ratings": [ + 7, + 6, + 5, + 5 + ], + "rating": "5.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Animals are equipped with a rich innate repertoire of sensory, behavioral and motor skills, which allows them to interact with the world immediately after birth. At the same time, many behaviors are highly adaptive and can be tailored to specific environments by means of learning. In this work, we use mathematical analysis and the framework of meta-learning (or 'learning to learn') to answer when it is beneficial to learn such an adaptive strategy and when to hard-code a heuristic behavior. We find that the interplay of ecological uncertainty, task complexity and the agents' lifetime has crucial effects on the meta-learned amortized Bayesian inference performed by an agent. There exist two regimes: One in which meta-learning yields a learning algorithm that implements task-dependent information-integration and a second regime in which meta-learning imprints a heuristic or 'hard-coded' behavior. Further analysis reveals that non-adaptive behaviors are not only optimal for aspects of the environment that are stable across individuals, but also in situations where an adaptation to the environment would in fact be highly beneficial, but could not be done quickly enough to be exploited within the remaining lifetime. 
Hard-coded behaviors should hence not only be those that always work, but also those that are too complex to be learned within a reasonable time frame.", + "title": "Learning not to learn: Nature versus nurture in silico", + "authors": [ + "Robert Tjarko Lange", + "Henning Sprekeler" + ], + "emails": [ + "tu-berlin.de", + "~Robert_Tjarko_Lange1" + ], + "rank": 1053 + }, + { + "url": "https://openreview.net/forum?id=H-SPvQtMwm", + "decision": "Reject", + "ratings": [ + 7, + 5, + 4, + 7 + ], + "rating": "5.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "The dot product self-attention is known to be central and indispensable to state-of-the-art Transformer models. But is it really required? This paper investigates the true importance and contribution of the dot product-based self-attention mechanism on the performance of Transformer models. Via extensive experiments, we find that (1) random alignment matrices surprisingly perform quite competitively and (2) learning attention weights from token-token (query-key) interactions is useful but not that important after all. To this end, we propose \\textsc{Synthesizer}, a model that learns synthetic attention weights without token-token interactions. In our experiments, we first show that simple Synthesizers achieve highly competitive performance when compared against vanilla Transformer models across a range of tasks, including machine translation, language modeling, text generation and GLUE/SuperGLUE benchmarks. When composed with dot product attention, we find that Synthesizers consistently outperform Transformers. Moreover, we conduct additional comparisons of Synthesizers against Dynamic Convolutions, showing that simple Random Synthesizer is not only 60% faster but also improves perplexity by a relative 3.5%. Finally, we show that simple factorized Synthesizers can outperform Linformers on encoding only tasks. 
", + "title": "Synthesizer: Rethinking Self-Attention for Transformer Models", + "authors": [ + "Yi Tay", + "Dara Bahri", + "Donald Metzler", + "Da-Cheng Juan", + "Zhe Zhao", + "Che Zheng" + ], + "emails": [ + "~Dara_Bahri1", + "~Zhe_Zhao3", + "google.com", + "~Yi_Tay1", + "~Da-Cheng_Juan1" + ], + "rank": 1054 + }, + { + "url": "https://openreview.net/forum?id=N9oPAFcuYWX", + "decision": "Reject", + "ratings": [ + 6, + 7, + 6, + 4 + ], + "rating": "5.75", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "With the widespread deployment of large-scale prediction systems in high-stakes domains, e.g., face recognition, criminal justice, etc., disparity on prediction accuracy between different demographic subgroups has called for fundamental understanding on the source of such disparity and algorithmic intervention to mitigate it. In this paper, we study the accuracy disparity problem in regression. To begin with, we first propose an error decomposition theorem, which decomposes the accuracy disparity into the distance between label populations and the distance between conditional representations, to help explain why such accuracy disparity appears in practice. Motivated by this error decomposition and the general idea of distribution alignment with statistical distances, we then propose an algorithm to reduce this disparity, and analyze its game-theoretic optima of the proposed objective function. We conduct experiments on four real-world datasets. 
The experimental results suggest that our proposed algorithms can effectively mitigate accuracy disparity while maintaining the predictive power of the regression models.", + "title": "Understanding and Mitigating Accuracy Disparity in Regression", + "authors": [ + "Jianfeng Chi", + "Han Zhao", + "Geoff Gordon", + "Yuan Tian" + ], + "emails": [ + "~Yuan_Tian2", + "~Geoff_Gordon2", + "~Han_Zhao1", + "~Jianfeng_Chi1" + ], + "rank": 1055 + }, + { + "url": "https://openreview.net/forum?id=xVzlFUD3uC", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 7 + ], + "rating": "5.75", + "confidences": [ + 2, + 3, + 4, + 3 + ], + "abstract": "We present a theoretical framework recasting data augmentation as stochastic optimization for a sequence of time-varying proxy losses. This provides a unified language for understanding techniques commonly thought of as data augmentation, including synthetic noise and label-preserving transformations, as well as more traditional ideas in stochastic optimization such as learning rate and batch size scheduling. We then specialize our framework to study arbitrary augmentations in the context of a simple model (overparameterized linear regression). We extend in this setting the classical Monro-Robbins theorem to include augmentation and obtain rates of convergence, giving conditions on the learning rate and augmentation schedule under which augmented gradient descent converges. 
Special cases give provably good schedules for augmentation with additive noise, minibatch SGD, and minibatch SGD with noise.", + "title": "Data augmentation as stochastic optimization", + "authors": [ + "Boris Hanin", + "Yi Sun" + ], + "emails": [ + "~Yi_Sun3", + "~Boris_Hanin1" + ], + "rank": 1056 + }, + { + "url": "https://openreview.net/forum?id=qSeqhriWKsn", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6, + 6 + ], + "rating": "5.75", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "We study sampling algorithms for variance reduction methods for stochastic optimization. Although stochastic gradient descent (SGD) is widely used for large scale machine learning, it sometimes experiences slow convergence rates due to the high variance from uniform sampling. In this paper, we introduce an algorithm that approximately samples a gradient from the optimal distribution for a common finite-sum form with n terms, while just making a single pass over the data, using input sparsity time, and \\tOTd space. Our algorithm can be implemented in big data models such as the streaming and distributed models. Moreover, we show that our algorithm can be generalized to approximately sample Hessians and thus provides variance reduction for second-order methods as well. We demonstrate the efficiency of our algorithm on large-scale datasets. ", + "title": "Adaptive Single-Pass Stochastic Gradient Descent in Input Sparsity Time", + "authors": [ + "Sepideh Mahabadi", + "David Woodruff", + "Samson Zhou" + ], + "emails": [ + "~David_Woodruff1", + "ttic.edu", + "~Samson_Zhou1" + ], + "rank": 1057 + }, + { + "url": "https://openreview.net/forum?id=zdrls6LIX4W", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 5 + ], + "rating": "5.75", + "confidences": [ + 3, + 5, + 4, + 4 + ], + "abstract": "A fundamental challenge in multiagent reinforcement learning is to learn beneficial behaviors in a shared environment with other agents that are also simultaneously learning. 
In particular, each agent perceives the environment as effectively non-stationary due to the changing policies of other agents. Moreover, each agent is itself constantly learning, leading to natural nonstationarity in the distribution of experiences encountered. In this paper, we propose a novel meta-multiagent policy gradient theorem that directly accommodates for the non-stationary policy dynamics inherent to these multiagent settings. This is achieved by modeling our gradient updates to directly consider both an agent\u2019s own non-stationary policy dynamics and the non-stationary policy dynamics of other agents interacting with it in the environment. We find that our theoretically grounded approach provides a general solution to the multiagent learning problem, which inherently combines key aspects of previous state of the art approaches on this topic. We test our method on several multiagent benchmarks and demonstrate a more efficient ability to adapt to new agents as they learn than previous related approaches across the spectrum of mixed incentive, competitive, and cooperative environments.", + "title": "A Policy Gradient Algorithm for Learning to Learn in Multiagent Reinforcement Learning", + "authors": [ + "Dong-Ki Kim", + "Miao Liu", + "Matthew D Riemer", + "Chuangchuang Sun", + "Marwa Abdulhai", + "Golnaz Habibi", + "Sebastian Lopez-Cot", + "Gerald Tesauro", + "JONATHAN P HOW" + ], + "emails": [ + "~Miao_Liu1", + "~Chuangchuang_Sun1", + "~Golnaz_Habibi1", + "mit.edu", + "~Matthew_D_Riemer1", + "~Dong-Ki_Kim1", + "~Gerald_Tesauro1", + "~JONATHAN_P_HOW1" + ], + "rank": 1058 + }, + { + "url": "https://openreview.net/forum?id=Q4EUywJIkqr", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 5, + 6, + 6 + ], + "rating": "5.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Deep object recognition models have been very successful over benchmark\ndatasets such as ImageNet. 
How accurate and robust are they to distribution\nshifts arising from natural and synthetic variations in datasets? Prior research on\nthis problem has primarily focused on ImageNet variations (e.g., ImageNetV2,\nImageNet-A). To avoid potential inherited biases in these studies, we take a\ndifferent approach. Specifically, we reanalyze the ObjectNet dataset recently\nproposed by Barbu et al. containing objects in daily life situations. They showed\na dramatic performance drop of the state of the art object recognition models on\nthis dataset. Due to the importance and implications of their results regarding\nthe generalization ability of deep models, we take a second look at their analysis.\nWe find that applying deep models to the isolated objects, rather than the entire\nscene as is done in the original paper, results in around 20-30% performance\nimprovement. Relative to the numbers reported in Barbu et al., around 10-15%\nof the performance loss is recovered, without any test time data augmentation.\nDespite this gain, however, we conclude that deep models still suffer drastically\non the ObjectNet dataset. We also investigate the robustness of models against\nsynthetic image perturbations such as geometric transformations (e.g., scale,\nrotation, translation), natural image distortions (e.g., impulse noise, blur) as well\nas adversarial attacks (e.g., FGSM and PGD-5). Our results indicate that limiting\nthe object area as much as possible (i.e., from the entire image to the bounding\nbox to the segmentation mask) leads to consistent improvement in accuracy and\nrobustness. Finally, through a qualitative analysis of ObjectNet data, we find that\ni) a large number of images in this dataset are hard to recognize even for humans,\nand ii) easy (hard) samples for models match with easy (hard) samples for humans.\nOverall, our analysis shows that ObjecNet is still a challenging test platform that\ncan be used to measure the generalization ability of models. 
The code and data\nare available in [masked due to blind review].", + "title": "Contemplating Real-World Object Classification", + "authors": [ + "ali borji" + ], + "emails": [ + "~ali_borji1" + ], + "rank": 1059 + }, + { + "url": "https://openreview.net/forum?id=fESskTMMSv", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 6 + ], + "rating": "5.75", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "Marginalized importance sampling (MIS), which measures the density ratio between the state-action occupancy of a target policy and that of a sampling distribution, is a promising approach for off-policy evaluation. However, current state-of-the-art MIS methods rely on complex optimization tricks and succeed mostly on simple toy problems. We bridge the gap between MIS and deep reinforcement learning by observing that the density ratio can be computed from the successor representation of the target policy. The successor representation can be trained through deep reinforcement learning methodology and decouples the reward optimization from the dynamics of the environment, making the resulting algorithm stable and applicable to high-dimensional domains. We evaluate the empirical performance of our approach on a variety of challenging Atari and MuJoCo environments.", + "title": "Practical Marginalized Importance Sampling with the Successor Representation", + "authors": [ + "Scott Fujimoto", + "David Meger", + "Doina Precup" + ], + "emails": [ + "~Scott_Fujimoto1", + "~Doina_Precup1", + "~David_Meger2" + ], + "rank": 1060 + }, + { + "url": "https://openreview.net/forum?id=GSTrduvZSjT", + "decision": "Reject", + "ratings": [ + 7, + 6, + 5, + 5 + ], + "rating": "5.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Adaptive gradient methods are typically used for training over-parameterized models capable of exactly fitting the data; we thus study their convergence in this interpolation setting. 
Under an interpolation assumption, we prove that AMSGrad with a constant step-size and momentum can converge to the minimizer at the faster O(1/T) rate for smooth, convex functions. Furthermore, in this setting, we show that AdaGrad can achieve an O(1) regret in the online convex optimization framework. When interpolation is only approximately satisfied, we show that constant step-size AMSGrad converges to a neighbourhood of the solution. On the other hand, we prove that AdaGrad is robust to the violation of interpolation and converges to the minimizer at the optimal rate. However, we demonstrate that even for simple, convex problems satisfying interpolation, the empirical performance of these methods heavily depends on the step-size and requires tuning. We alleviate this problem by using stochastic line-search (SLS) and Polyak's step-sizes (SPS) to help these methods adapt to the function's local smoothness. By using these techniques, we prove that AdaGrad and AMSGrad do not require knowledge of problem-dependent constants and retain the convergence guarantees of their constant step-size counterparts. Experimentally, we show that these techniques help improve the convergence and generalization performance across tasks, from binary classification with kernel mappings to classification with deep neural networks.", + "title": "Adaptive Gradient Methods Converge Faster with Over-Parameterization (and you can do a line-search)", + "authors": [ + "Sharan Vaswani", + "Issam H. 
Laradji", + "Frederik Kunstner", + "Si Yi Meng", + "Mark Schmidt", + "Simon Lacoste-Julien" + ], + "emails": [ + "~Frederik_Kunstner1", + "~Issam_H._Laradji1", + "~Mark_Schmidt1", + "~Simon_Lacoste-Julien1", + "~Sharan_Vaswani1", + "~Si_Yi_Meng1" + ], + "rank": 1061 + }, + { + "url": "https://openreview.net/forum?id=Ms9zjhVB5R", + "decision": "Reject", + "ratings": [ + 4, + 7, + 7 + ], + "rating": "5.75", + "confidences": [ + 5, + 4, + 3 + ], + "abstract": "Adversarial training is a common approach to improving the robustness of deep neural networks against adversarial examples. In this work, we propose a novel regularization approach as an alternative. To derive the regularizer, we formulate the adversarial robustness problem under the robust optimization framework and approximate the loss function using a second-order Taylor series expansion. Our proposed second-order adversarial regularizer (SOAR) is an upper bound based on the Taylor approximation of the inner-max in the robust optimization objective. We empirically show that the proposed method improves the robustness of networks against the \u2113\u221e and \u21132 bounded perturbations on CIFAR-10 and SVHN.", + "title": "SOAR: Second-Order Adversarial Regularization", + "authors": [ + "Avery Ma", + "Fartash Faghri", + "Nicolas Papernot", + "Amir-massoud Farahmand" + ], + "emails": [ + "~Avery_Ma1", + "~Amir-massoud_Farahmand1", + "~Fartash_Faghri1", + "~Nicolas_Papernot1" + ], + "rank": 1062 + }, + { + "url": "https://openreview.net/forum?id=lo7GKwmakFZ", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 6 + ], + "rating": "5.75", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "In continuing control tasks, an agent\u2019s average reward per time step is a more natural performance measure compared to the commonly used discounting framework since it can better capture an agent\u2019s long-term behavior. 
We derive a novel lower bound on the difference of the long-term average reward for two policies. The lower bound depends on the average divergence between the policies and on the so-called Kemeny constant, which measures to what degree the unichain Markov chains associated with the policies are well-connected. We also show that previous work based on the discounted return (Schulman et al., 2015; Achiam et al., 2017) results in a non-meaningful lower bound in the average reward setting. Based on our lower bound, we develop an iterative procedure which produces a sequence of monotonically improved policies for the average reward criterion. When combined with Deep Reinforcement Learning (DRL) methods, the procedure leads to scalable and efficient algorithms for maximizing the agent\u2019s average reward performance.Empirically we demonstrate the effectiveness of our method on continuing control tasks and show how discounting can lead to unsatisfactory performance.", + "title": "Average Reward Reinforcement Learning with Monotonic Policy Improvement", + "authors": [ + "Yiming Zhang", + "Keith W. Ross" + ], + "emails": [ + "~Keith_W._Ross1", + "~Yiming_Zhang1" + ], + "rank": 1063 + }, + { + "url": "https://openreview.net/forum?id=EdXhmWvvQV", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 6 + ], + "rating": "5.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Recent advances in unsupervised representation learning have experienced remarkable progress, especially with the achievements of contrastive learning, which regards each image as well its augmentations as a separate class, while does not consider the semantic similarity among images. This paper proposes a new kind of data augmentation, named Center-wise Local Image Mixture, to expand the neighborhood space of an image. CLIM encourages both local similarity and global aggregation while pulling similar images. 
This is achieved by searching local similar samples of an image, and only selecting images that are closer to the corresponding cluster center, which we denote as center-wise local selection. As a result, similar representations are progressively approaching the clusters, while do not break the local similarity. Furthermore, image mixture is used as a smoothing regularization to avoid overconfident the selected samples. Besides, we introduce multi-resolution augmentation, which enables the representation to be scale invariant. Integrating the two augmentations produces better feature representation on several unsupervised benchmarks. Notably, we reach 75.5% top-1 accuracy with linear evaluation over ResNet-50, and 59.3% top-1 accuracy when fine-tuned with only 1% labels, as well as consistently outperforming supervised pretraining on several downstream transfer tasks.", + "title": "Center-wise Local Image Mixture For Contrastive Representation Learning", + "authors": [ + "Hao Li", + "XIAOPENG ZHANG", + "Ruoyu Sun", + "Hongkai Xiong", + "Qi Tian" + ], + "emails": [ + "~Ruoyu_Sun2", + "~Hongkai_Xiong1", + "~XIAOPENG_ZHANG7", + "~Qi_Tian3", + "~Hao_Li11" + ], + "rank": 1064 + }, + { + "url": "https://openreview.net/forum?id=Ut1vF_q_vC", + "decision": "Accept (Spotlight)", + "ratings": [ + 6, + 2, + 8, + 8 + ], + "rating": "5.75", + "confidences": [ + 3, + 5, + 4, + 4 + ], + "abstract": "Despite the success of neural models on many major machine learning problems, their effectiveness on traditional Learning-to-Rank (LTR) problems is still not widely acknowledged. We first validate this concern by showing that most recent neural LTR models are, by a large margin, inferior to the best publicly available Gradient Boosted Decision Trees (GBDT) in terms of their reported ranking accuracy on benchmark datasets. This unfortunately was somehow overlooked in recent neural LTR papers. 
We then investigate why existing neural LTR models under-perform and identify several of their weaknesses. Furthermore, we propose a unified framework comprising of counter strategies to ameliorate the existing weaknesses of neural models. Our models are the first to be able to perform equally well, comparing with the best tree-based baseline, while outperforming recently published neural LTR models by a large margin. Our results can also serve as a benchmark to facilitate future improvement of neural LTR models.", + "title": "Are Neural Rankers still Outperformed by Gradient Boosted Decision Trees?", + "authors": [ + "Zhen Qin", + "Le Yan", + "Honglei Zhuang", + "Yi Tay", + "Rama Kumar Pasumarthi", + "Xuanhui Wang", + "Michael Bendersky", + "Marc Najork" + ], + "emails": [ + "~Xuanhui_Wang1", + "google.com", + "~Yi_Tay1", + "~Zhen_Qin5", + "~Rama_Kumar_Pasumarthi1", + "~Honglei_Zhuang1" + ], + "rank": 1065 + }, + { + "url": "https://openreview.net/forum?id=n5ej38Vfuup", + "decision": "Reject", + "ratings": [ + 8, + 5, + 6, + 4 + ], + "rating": "5.75", + "confidences": [ + 4, + 2, + 5, + 5 + ], + "abstract": "One of the difficulties in modeling real-world data is their complex multi-manifold structure due to discrete features. In this paper, we propose quotient manifold modeling (QMM), a new data-modeling scheme that considers generic manifold structure independent of discrete features, thereby deriving efficiency in modeling and allowing generalization over untrained manifolds. QMM considers a deep encoder inducing an equivalence between manifolds; but we show it is sufficient to consider it only implicitly via a bias-regularizer we derive. This makes QMM easily applicable to existing models such as GANs and VAEs, and experiments show that these models not only present superior FID scores but also make good generalizations across different datasets. 
In particular, we demonstrate an MNIST model that synthesizes EMNIST alphabets.", + "title": "Deep Quotient Manifold Modeling", + "authors": [ + "Jiseob Kim", + "Seungjae Jung", + "Hyundo Lee", + "Byoung-Tak Zhang" + ], + "emails": [ + "~Byoung-Tak_Zhang1", + "bi.snu.ac.kr", + "~Jiseob_Kim1" + ], + "rank": 1066 + }, + { + "url": "https://openreview.net/forum?id=zM6fevLxIhI", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 6 + ], + "rating": "5.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "State-of-the-art performances in dense pixel-wise prediction tasks are obtained with specifically designed convolutional networks. These models often benefit from attention mechanisms that allow better learning of deep representations. Recent works showed the importance of estimating both spatial- and channel-wise attention tensors. In this paper, we propose a unified approach to jointly estimate spatial attention maps and channel attention vectors so as to structure the resulting attention tensor. Moreover, we integrate the estimation of the attention within a probabilistic framework, leading to VarIational STructured Attention networks(VISTA). We implement the inference rules within the neural network, thus allowing for joint learning of the probabilistic and the CNN front-end parameters. 
Importantly, as demonstrated by our extensive empirical evaluation on six large-scale datasets VISTA outperforms the state-of-the-art in multiple continuous and discrete pixel-level prediction tasks, thus confirming the benefit of structuring the attention tensor and of inferring it within a probabilistic formulation.", + "title": "Variational Structured Attention Networks for Dense Pixel-Wise Prediction", + "authors": [ + "Guanglei Yang", + "Paolo Rota", + "Xavier Alameda-Pineda", + "Dan Xu", + "Mingli Ding", + "Elisa Ricci" + ], + "emails": [ + "~Xavier_Alameda-Pineda1", + "~Dan_Xu4", + "~Elisa_Ricci1", + "hit.edu.cn", + "~Paolo_Rota1", + "~Guanglei_Yang1" + ], + "rank": 1067 + }, + { + "url": "https://openreview.net/forum?id=jWXBUsWP7N", + "decision": "Reject", + "ratings": [ + 6, + 5, + 7, + 5 + ], + "rating": "5.75", + "confidences": [ + 2, + 4, + 5, + 5 + ], + "abstract": "Recent distributional reinforcement learning methods, despite their successes, still contain fundamental problems that can lead to inaccurate representations of value distributions, such as distributional instability, action type restriction, and conflation between samples and statistics. In this paper, we present a novel distributional actor-critic framework, GMAC, to address such problems. Adopting a stochastic policy removes the first two problems, and the conflation in the approximation is alleviated by minimizing the Crame \u0301r distance between the value distribution and its Bellman target distribution. In addition, GMAC improves data efficiency by generating the Bellman target distribution through the Sample-Replacement algorithm, denoted by SR(\u03bb), which provides a distributional generalization of multi-step policy evaluation algorithms. 
We empirically show that our method captures the multimodality of value distributions and improves the performance of a conventional actor-critic method with low computational cost in both discrete and continuous action spaces, using Arcade Learning Environment (ALE) and PyBullet environment.", + "title": "A Distributional Perspective on Actor-Critic Framework", + "authors": [ + "Daniel Wontae Nam", + "Younghoon Kim", + "Chan Youn Park" + ], + "emails": [ + "~Younghoon_Kim1", + "~Daniel_Wontae_Nam1", + "~Chan_Youn_Park1" + ], + "rank": 1068 + }, + { + "url": "https://openreview.net/forum?id=083vV3utxpC", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6, + 6 + ], + "rating": "5.75", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "Emerging edge intelligence applications require the server to continuously retrain and update deep neural networks deployed on remote edge nodes to leverage newly collected data samples. Unfortunately, it may be impossible in practice to continuously send fully updated weights to these edge nodes due to the highly constrained communication resource. In this paper, we propose the weight-wise deep partial updating paradigm, which smartly selects only a subset of weights to update at each server-to-edge communication round, while achieving a similar performance compared to full updating. Our method is established through analytically upper-bounding the loss difference between partial updating and full updating, and only updates the weights which make the largest contributions to the upper bound. 
Extensive experimental results demonstrate the efficacy of our partial updating methodology which achieves a high inference accuracy while updating a rather small number of weights.", + "title": "Deep Partial Updating", + "authors": [ + "Zhongnan Qu", + "Cong Liu", + "Junfeng Guo", + "Lothar Thiele" + ], + "emails": [ + "~Zhongnan_Qu1", + "~Cong_Liu2", + "~Lothar_Thiele1", + "~Junfeng_Guo2" + ], + "rank": 1069 + }, + { + "url": "https://openreview.net/forum?id=V1N4GEWki_E", + "decision": "Reject", + "ratings": [ + 7, + 6, + 5, + 5 + ], + "rating": "5.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Sparse Neural Networks (NNs) can match the generalization of dense NNs using a fraction of the compute/storage for inference, and also have the potential to enable efficient training. However, naively training unstructured sparse NNs from random initialization results in significantly worse generalization, with the notable exception of Lottery Tickets (LTs) and Dynamic Sparse Training (DST). In this work, we attempt to answer: (1) why training unstructured sparse networks from random initialization performs poorly and; and (2) what makes LTs and DST the exceptions? We show that sparse NNs have poor gradient flow at initialization and propose a modified initialization for unstructured connectivity. Furthermore, we find that DST methods significantly improve gradient flow during training over traditional sparse training methods. 
Finally, we show that LTs do not improve gradient flow, rather their success lies in re-learning the pruning solution they are derived from \u2014 however, this comes at the cost of learning novel solutions.", + "title": "Gradient Flow in Sparse Neural Networks and How Lottery Tickets Win", + "authors": [ + "Utku Evci", + "Yani Ioannou", + "Cem Keskin", + "Yann Dauphin" + ], + "emails": [ + "~Yani_Ioannou1", + "~Cem_Keskin2", + "~Yann_Dauphin1", + "~Utku_Evci1" + ], + "rank": 1070 + }, + { + "url": "https://openreview.net/forum?id=PGmqOzKEPZN", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6, + 6 + ], + "rating": "5.75", + "confidences": [ + 2, + 3, + 3, + 4 + ], + "abstract": "The estimation of the ratio of two probability densities has garnered attention as the density ratio is useful in various machine learning tasks, such as anomaly detection and domain adaptation. To estimate the density ratio, methods collectively known as direct density ratio estimation (DRE) have been explored. These methods are based on the minimization of the Bregman (BR) divergence between a density ratio model and the true density ratio. However, existing direct DRE suffers from serious overfitting when using flexible models such as neural networks. In this paper, we introduce a non-negative correction for empirical risk using only the prior knowledge of the upper bound of the density ratio. This correction makes a DRE method more robust against overfitting and enables the use of flexible models. In the theoretical analysis, we discuss the consistency of the empirical risk. 
In our experiments, the proposed estimators show favorable performance in inlier-based outlier detection and covariate shift adaptation.", + "title": "Non-Negative Bregman Divergence Minimization for Deep Direct Density Ratio Estimation", + "authors": [ + "Masahiro Kato", + "Takeshi Teshima" + ], + "emails": [ + "~Takeshi_Teshima1", + "~Masahiro_Kato1" + ], + "rank": 1071 + }, + { + "url": "https://openreview.net/forum?id=MY3WGKsXct_", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 5 + ], + "rating": "5.75", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "Membership inference attacks (MIA) try to detect if data samples were used to train a Neural Network model. As training data is very valuable in machine learning, MIA can be used to detect the use of unauthorized data. Unlike the traditional MIA approaches, addressing classification models, we address conditional image generation models (e.g. image translation). \nDue to overfitting, reconstruction errors are typically lower for images used in training. A simple but effective approach for membership attacks can therefore use the reconstruction error.\nHowever, we observe that some images are \"universally\" easy, and others are difficult. Reconstruction error alone is less effective at discriminating between difficult images used in training and easy images that were never seen before. To overcome this, we propose to use a novel difficulty score that can be computed for each image, and its computation does not require a training set. 
Our membership error, obtained by subtracting the difficulty score from the reconstruction error, is shown to achieve high MIA accuracy on an extensive number of benchmarks.", + "title": "Membership Attacks on Conditional Generative Models Using Image Difficulty", + "authors": [ + "Avital Shafran", + "Shmuel Peleg", + "Yedid Hoshen" + ], + "emails": [ + "~Shmuel_Peleg1", + "~Yedid_Hoshen3", + "mail.huji.ac.il" + ], + "rank": 1072 + }, + { + "url": "https://openreview.net/forum?id=PiKUvDj5jyN", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 6 + ], + "rating": "5.75", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "In psychology, relational learning refers to the ability to recognize and respond to\nrelationship among objects irrespective of the nature of those objects. Relational\nlearning has long been recognized as a hallmark of human cognition and a key\nquestion in artificial intelligence research. In this work, we propose an unsupervised\nlearning method for addressing the relational learning problem where we\nlearn the underlying relationship between a pair of data irrespective of the nature\nof those data. The central idea of the proposed method is to encapsulate the relational\nlearning problem with a probabilistic graphical model in which we perform\ninference to learn about data relationships and other relational processing tasks.", + "title": "Relational Learning with Variational Bayes", + "authors": [ + "Kuang-Hung Liu" + ], + "emails": [ + "~Kuang-Hung_Liu1" + ], + "rank": 1073 + }, + { + "url": "https://openreview.net/forum?id=AVKFuhH1Fo4", + "decision": "Reject", + "ratings": [ + 6, + 4, + 7, + 6 + ], + "rating": "5.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Despite their ubiquity in core AI fields like natural language processing, the mechanics of deep attention-based neural networks like the ``Transformer'' model are not fully understood. 
In this article, we present a new perspective towards understanding how Transformers work. In particular, we show that the ``\"dot-product attention\" that is the core of the Transformer's operation can be characterized as a kernel learning method on a pair of Banach spaces. In particular, the Transformer's kernel is characterized as having an infinite feature dimension. Along the way we generalize the standard kernel learning problem to what we term a \"binary\" kernel learning problem, where data come from two input domains and a response is defined for every cross-domain pair. We prove a new representer theorem for these binary kernel machines with non-Mercer (indefinite, asymmetric) kernels (implying that the functions learned are elements of reproducing kernel Banach spaces rather than Hilbert spaces), and also prove a new universal approximation theorem showing that the Transformer calculation can learn any binary non-Mercer reproducing kernel Banach space pair. We experiment with new kernels in Transformers, and obtain results that suggest the infinite dimensionality of the standard Transformer kernel is partially responsible for its performance. This paper's results provide a new theoretical understanding of a very important but poorly understood model in modern machine learning.", + "title": "Transformers are Deep Infinite-Dimensional Non-Mercer Binary Kernel Machines", + "authors": [ + "Matthew A Wright", + "Joseph E. Gonzalez" + ], + "emails": [ + "~Joseph_E._Gonzalez1", + "~Matthew_A_Wright1" + ], + "rank": 1074 + }, + { + "url": "https://openreview.net/forum?id=YNnpaAKeCfx", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 4 + ], + "rating": "5.75", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "Training a fair machine learning model is essential to prevent demographic disparity. 
Existing techniques for improving model fairness require broad changes in either data preprocessing or model training, rendering themselves difficult-to-adopt for potentially already complex machine learning systems. We address this problem via the lens of bilevel optimization. While keeping the standard training algorithm as an inner optimizer, we incorporate an outer optimizer so as to equip the inner problem with an additional functionality: Adaptively selecting minibatch sizes for the purpose of improving model fairness. Our batch selection algorithm, which we call FairBatch, implements this optimization and supports prominent fairness measures: equal opportunity, equalized odds, and demographic parity. FairBatch comes with a significant implementation benefit -- it does not require any modification to data preprocessing or model training. For instance, a single-line change of PyTorch code for replacing batch selection part of model training suffices to employ FairBatch. Our experiments conducted both on synthetic and benchmark real data demonstrate that FairBatch can provide such functionalities while achieving comparable (or even greater) performances against the state of the arts. Furthermore, FairBatch can readily improve fairness of any pre-trained model simply via fine-tuning. 
It is also compatible with existing batch selection techniques intended for different purposes, such as faster convergence, thus gracefully achieving multiple purposes.", + "title": "FairBatch: Batch Selection for Model Fairness", + "authors": [ + "Yuji Roh", + "Kangwook Lee", + "Steven Euijong Whang", + "Changho Suh" + ], + "emails": [ + "~Kangwook_Lee1", + "~Yuji_Roh1", + "~Changho_Suh1", + "~Steven_Euijong_Whang1" + ], + "rank": 1075 + }, + { + "url": "https://openreview.net/forum?id=cP5IcoAkfKa", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 6, + 5, + 7, + 7 + ], + "rating": "5.75", + "confidences": [ + 4, + 2, + 3, + 3, + 4 + ], + "abstract": "We accelerate deep reinforcement learning-based training in visually complex 3D environments by two orders of magnitude over prior work, realizing end-to-end training speeds of over 19,000 frames of experience per second on a single GPU and up to 72,000 frames per second on a single eight-GPU machine. The key idea of our approach is to design a 3D renderer and embodied navigation simulator around the principle of \u201cbatch simulation\u201d: accepting and executing large batches of requests simultaneously. Beyond exposing large amounts of work at once, batch simulation allows implementations to amortize in-memory storage of scene assets, rendering work, data loading, and synchronization costs across many simulation requests, dramatically improving the number of simulated agents per GPU and overall simulation throughput. To balance DNN inference and training costs with faster simulation, we also build a computationally efficient policy DNN that maintains high task performance, and modify training algorithms to maintain sample efficiency when training with large mini-batches. 
By combining batch simulation and DNN performance optimizations, we demonstrate that PointGoal navigation agents can be trained in complex 3D environments on a single GPU in 1.5 days to 97% of the accuracy of agents trained on a prior state-of-the-art system using a 64-GPU cluster over three days. We provide open-source reference implementations of our batch 3D renderer and simulator to facilitate incorporation of these ideas into RL systems.", + "title": "Large Batch Simulation for Deep Reinforcement Learning", + "authors": [ + "Brennan Shacklett", + "Erik Wijmans", + "Aleksei Petrenko", + "Manolis Savva", + "Dhruv Batra", + "Vladlen Koltun", + "Kayvon Fatahalian" + ], + "emails": [ + "~Kayvon_Fatahalian2", + "~Vladlen_Koltun1", + "usc.edu", + "~Manolis_Savva1", + "~Dhruv_Batra1", + "~Brennan_Shacklett1", + "~Erik_Wijmans1" + ], + "rank": 1076 + }, + { + "url": "https://openreview.net/forum?id=qiydAcw6Re", + "decision": "Reject", + "ratings": [ + 4, + 5, + 7 + ], + "rating": "5.75", + "confidences": [ + 1, + 1, + 2 + ], + "abstract": "We present a new perspective on program synthesis in which programs may be identified with singularities of analytic functions. As an example, Turing machines are synthesised from input-output examples by propagating uncertainty through a smooth relaxation of a universal Turing machine. The posterior distribution over weights is approximated using Markov chain Monte Carlo and bounds on the generalisation error of these models is estimated using the real log canonical threshold, a geometric invariant from singular learning theory. 
\n", + "title": "Geometry of Program Synthesis", + "authors": [ + "James Clift", + "Daniel Murfet", + "James Wallbridge" + ], + "emails": [ + "~Daniel_Murfet1", + "~James_Wallbridge1", + "~James_Clift1" + ], + "rank": 1077 + }, + { + "url": "https://openreview.net/forum?id=l0V53bErniB", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 4, + 7 + ], + "rating": "5.75", + "confidences": [ + 3, + 5, + 5, + 3 + ], + "abstract": "The flow estimation problem consists of predicting missing edge flows in a network (e.g., traffic, power, and water) based on partial observations. These missing flows depend both on the underlying \\textit{physics} (edge features and a flow conservation law) as well as the observed edge flows. This paper introduces an optimization framework for computing missing edge flows and solves the problem using bilevel optimization and deep learning. More specifically, we learn regularizers that depend on edge features (e.g., number of lanes in a road, the resistance of a power line) using neural networks. 
Empirical results show that our method accurately predicts missing flows, outperforming the best baseline, and is able to capture relevant physical properties in traffic and power networks.", + "title": "Combining Physics and Machine Learning for Network Flow Estimation", + "authors": [ + "Arlei Lopes da Silva", + "Furkan Kocayusufoglu", + "Saber Jafarpour", + "Francesco Bullo", + "Ananthram Swami", + "Ambuj Singh" + ], + "emails": [ + "~Ambuj_Singh1", + "ucsb.edu", + "~Ananthram_Swami1", + "engineering.ucsb.edu", + "cs.ucsb.edu", + "~Arlei_Lopes_da_Silva1" + ], + "rank": 1078 + }, + { + "url": "https://openreview.net/forum?id=v8b3e5jN66j", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 5, + 5 + ], + "rating": "5.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Recent methods for learning unsupervised visual representations, dubbed contrastive learning, optimize the noise-contrastive estimation (NCE) bound on mutual information between two transformations of an image. NCE typically uses randomly sampled negative examples to normalize the objective, but this may often include many uninformative examples either because they are too easy or too hard to discriminate. Taking inspiration from metric learning, we show that choosing semi-hard negatives can yield stronger contrastive representations. To do this, we introduce a family of mutual information estimators that sample negatives conditionally -- in a \"ring\" around each positive. We prove that these estimators remain lower-bounds of mutual information, with higher bias but lower variance than NCE. Experimentally, we find our approach, applied on top of existing models (IR, CMC, and MoCo) improves accuracy by 2-5% absolute points in each case, measured by linear evaluation on four standard image benchmarks. 
Moreover, we find continued benefits when transferring features to a variety of new image distributions from the Meta-Dataset collection and to a variety of downstream tasks such as object detection, instance segmentation, and key-point detection.", + "title": "Conditional Negative Sampling for Contrastive Learning of Visual Representations", + "authors": [ + "Mike Wu", + "Milan Mosse", + "Chengxu Zhuang", + "Daniel Yamins", + "Noah Goodman" + ], + "emails": [ + "stanford.edu", + "~Mike_Wu1", + "~Daniel_Yamins1", + "~Chengxu_Zhuang1", + "~Noah_Goodman1" + ], + "rank": 1079 + }, + { + "url": "https://openreview.net/forum?id=P0p33rgyoE", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 5, + 6, + 6 + ], + "rating": "5.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "In this paper, we revisit variational intrinsic control (VIC), an unsupervised reinforcement learning method for finding the largest set of intrinsic options available to an agent. In the original work by Gregor et al. (2016), two VIC algorithms were proposed: one that represents the options explicitly, and the other that does it implicitly. We show that the intrinsic reward used in the latter is subject to bias in stochastic environments, causing convergence to suboptimal solutions. To correct this behavior, we propose two methods respectively based on the transitional probability model and Gaussian Mixture Model. We substantiate our claims through rigorous mathematical derivations and experimental analyses. ", + "title": "Variational Intrinsic Control Revisited", + "authors": [ + "Taehwan Kwon" + ], + "emails": [ + "~Taehwan_Kwon1" + ], + "rank": 1080 + }, + { + "url": "https://openreview.net/forum?id=Py4VjN6V2JX", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 7 + ], + "rating": "5.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Contrastive self-supervised learning has delivered impressive results in many audio-visual recognition tasks. 
However, existing approaches optimize for learning either global representations useful for high-level understanding tasks such as classification, or local representations useful for tasks such as audio-visual source localization and separation. While they produce satisfactory results in their intended downstream scenarios, they often fail to generalize to tasks that they were not originally designed for. In this work, we propose a versatile self-supervised approach to learn audio-visual representations that can generalize to both the tasks which require global semantic information (e.g., classification) and the tasks that require fine-grained spatio-temporal information (e.g. localization). We achieve this by optimizing two cross-modal contrastive objectives that together encourage our model to learn discriminative global-local visual information given audio signals. To show that our approach learns generalizable video representations, we evaluate it on various downstream scenarios including action/sound classification, lip reading, deepfake detection, and sound source localization. ", + "title": "Contrastive Self-Supervised Learning of Global-Local Audio-Visual Representations", + "authors": [ + "Shuang Ma", + "Zhaoyang Zeng", + "Daniel McDuff", + "Yale Song" + ], + "emails": [ + "~Shuang_Ma3", + "~Yale_Song1", + "~Zhaoyang_Zeng1", + "~Daniel_McDuff1" + ], + "rank": 1081 + }, + { + "url": "https://openreview.net/forum?id=E4PK0rg2eP", + "decision": "Reject", + "ratings": [ + 4, + 5, + 6, + 8 + ], + "rating": "5.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "While task-specific finetuning of deep networks pretrained with self-supervision has led to significant empirical advances in NLP, their large size makes the standard finetuning approach difficult to apply to multi-task, memory-constrained settings, as storing the full model parameters for each task become prohibitively expensive. 
We propose diff pruning as a simple approach to enable parameter-efficient transfer learning within the pretrain-finetune framework. This approach views finetuning as learning a task-specific diff vector that is applied on top of the pretrained parameter vector, which remains fixed and is shared across different tasks. The diff vector is adaptively pruned during training with a differentiable approximation to the L0-norm penalty to encourage sparsity. Diff pruning becomes parameter-efficient as the number of tasks increases, as it requires storing only the nonzero positions and weights of the diff vector for each task, while the cost of storing the shared pretrained model remains constant. We find that models finetuned with diff pruning can match the performance of fully finetuned baselines on the GLUE benchmark while only modifying 0.5% of the pretrained model's parameters per task.", + "title": "Parameter-Efficient Transfer Learning with Diff Pruning", + "authors": [ + "Demi Guo", + "Alexander M Rush", + "Yoon Kim" + ], + "emails": [ + "~Yoon_Kim1", + "~Demi_Guo1", + "~Alexander_M_Rush1" + ], + "rank": 1082 + }, + { + "url": "https://openreview.net/forum?id=Y5TgO3J_Glc", + "decision": "Reject", + "ratings": [ + 6, + 6, + 7, + 4 + ], + "rating": "5.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Recently, there has been significant progress designing deep generative models that generate realistic sequence data such as text or music. Nevertheless, it remains difficult to incorporate high-level structure to guide the generative process. We propose a novel approach for incorporating structure in the form of relational constraints between different subcomponents of an example (e.g., lines of a poem or measures of music). Our generative model has two parts: (i) one model to generate a realistic set of relational constraints, and (ii) a second model to generate realistic data satisfying these constraints. 
To train model (i), we propose a novel program synthesis algorithm that infers the relational constraints present in the training data, and then train the models based on the resulting relational constraints. In our experiments, we show that our approach significantly improves over state-of-the-art approaches in terms of capturing high-level structure in the data, while performing comparably or better in terms of low-level structure.", + "title": "Neurosymbolic Deep Generative Models for Sequence Data with Relational Constraints", + "authors": [ + "Halley Young", + "Maxwell Du", + "Osbert Bastani" + ], + "emails": [ + "~Osbert_Bastani1", + "seas.upenn.edu", + "~Halley_Young1" + ], + "rank": 1083 + }, + { + "url": "https://openreview.net/forum?id=b905-XVjbDO", + "decision": "Reject", + "ratings": [ + 5, + 8, + 5, + 5 + ], + "rating": "5.75", + "confidences": [ + 3, + 3, + 2, + 4 + ], + "abstract": "Injectivity plays an important role in generative models where it enables inference; in inverse problems and compressed sensing with generative priors it is a precursor to well posedness. We establish sharp characterizations of injectivity of fully-connected and convolutional ReLU layers and networks. First, through a layerwise analysis, we show that an expansivity factor of two is necessary and sufficient for injectivity by constructing appropriate weight matrices. We show that global injectivity with iid Gaussian matrices, a commonly used tractable model, requires larger expansivity between 3.4 and 10.5. We also characterize the stability of inverting an injective network via worst-case Lipschitz constants of the inverse. We then use arguments from differential topology to study injectivity of deep networks and prove that any Lipschitz map can be approximated by an injective ReLU network. Finally, using an argument based on random projections, we show that an end-to-end---rather than layerwise---doubling of the dimension suffices for injectivity. 
Our results establish a theoretical basis for the study of nonlinear inverse and inference problems using neural networks.", + "title": "Globally Injective ReLU networks", + "authors": [ + "Michael Puthawala", + "Konik Kothari", + "Matti Lassas", + "Ivan Dokmani\u0107", + "Maarten V. de Hoop" + ], + "emails": [ + "~Maarten_V._de_Hoop2", + "rice.edu", + "~Konik_Kothari1", + "~Ivan_Dokmani\u01071", + "~Matti_Lassas1" + ], + "rank": 1084 + }, + { + "url": "https://openreview.net/forum?id=k2Om84I9JuX", + "decision": "Reject", + "ratings": [ + 6, + 4, + 4, + 9 + ], + "rating": "5.74", + "confidences": [ + 4, + 5, + 5, + 5 + ], + "abstract": "Choosing the optimizer is considered to be among the most crucial design decisions in deep learning, and it is not an easy one. The growing literature now lists hundreds of optimization methods. In the absence of clear theoretical guidance and conclusive empirical evidence, the decision is often made based on anecdotes. In this work, we aim to replace these anecdotes, if not with a conclusive ranking, then at least with evidence-backed heuristics. To do so, we perform an extensive, standardized benchmark of more than a dozen particularly popular deep learning optimizers while giving a concise overview of the wide range of possible choices. Analyzing almost 35,000 individual runs, we contribute the following three points: (i) Optimizer performance varies greatly across tasks. (ii) We observe that evaluating multiple optimizers with default parameters works approximately as well as tuning the hyperparameters of a single, fixed optimizer. (iii) While we can not discern an optimization method clearly dominating across all tested tasks, we identify a significantly reduced subset of specific algorithms and parameter choices that generally lead to competitive results in our experiments. This subset includes popular favorites and some lesser-known contenders. 
We have open-sourced all our experimental results, making them directly available as challenging and well-tuned baselines. This allows for more meaningful comparisons when evaluating novel optimization methods without requiring any further computational efforts.", + "title": "Descending through a Crowded Valley \u2014 Benchmarking Deep Learning Optimizers", + "authors": [ + "Robin Marc Schmidt", + "Frank Schneider", + "Philipp Hennig" + ], + "emails": [ + "~Frank_Schneider1", + "~Robin_Marc_Schmidt1", + "~Philipp_Hennig1" + ], + "rank": 1085 + }, + { + "url": "https://openreview.net/forum?id=vujTf_I8Kmc", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 5 + ], + "rating": "5.74", + "confidences": [ + 5, + 5, + 4, + 5 + ], + "abstract": "The success of deep convolutional neural networks builds on top of the learning of effective convolution operations, capturing a hierarchy of structured features via filtering, activation, and pooling. However, the explicit structured features, e.g. object parts, are not expressive in the existing CNN frameworks. In this paper, we tackle the few-shot learning problem and make an effort to enhance structured features by expanding CNNs with a constellation model, which performs cell feature clustering and encoding with a dense part representation; the relationships among the cell features are further modeled by an attention mechanism. With the additional constellation branch to increase the awareness of object parts, our method is able to attain the advantages of the CNNs while making the overall internal representations more robust in the few-shot learning setting. 
Our approach attains a significant improvement over the existing methods in few-shot learning on the CIFAR-FS, FC100, and mini-ImageNet benchmarks.", + "title": "Attentional Constellation Nets for Few-Shot Learning", + "authors": [ + "Weijian Xu", + "yifan xu", + "Huaijin Wang", + "Zhuowen Tu" + ], + "emails": [ + "~Huaijin_Wang1", + "~Zhuowen_Tu1", + "~yifan_xu1", + "~Weijian_Xu1" + ], + "rank": 1086 + }, + { + "url": "https://openreview.net/forum?id=u4WfreuXxnk", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 6, + 6 + ], + "rating": "5.74", + "confidences": [ + 5, + 4, + 3, + 5, + 2 + ], + "abstract": "Graph neural networks (GNNs) have shown broad applicability in a variety of domains.\nSome of these domains, such as social networks and product recommendations, are fertile ground for malicious users and behavior.\nIn this paper, we show that GNNs are vulnerable to the extremely limited scenario of a single-node adversarial example, where the node cannot be picked by the attacker. \nThat is, an attacker can force the GNN to classify any target node to a chosen label by only slightly perturbing another single arbitrary node in the graph, even when not being able to pick that specific attacker node. When the adversary is allowed to pick a specific attacker node, the attack is even more effective. 
\nWe show that this attack is effective across various GNN types (e.g., GraphSAGE, GCN, GAT, and GIN), across a variety of real-world datasets, and as a targeted and non-targeted attack.\nOur code is available anonymously at https://github.com/gnnattack/SINGLE .", + "title": "Single-Node Attack for Fooling Graph Neural Networks", + "authors": [ + "Ben Finkelshtein", + "Chaim Baskin", + "Evgenii Zheltonozhskii", + "Uri Alon" + ], + "emails": [ + "~Uri_Alon1", + "~Chaim_Baskin1", + "campus.technion.ac.il", + "~Evgenii_Zheltonozhskii1" + ], + "rank": 1087 + }, + { + "url": "https://openreview.net/forum?id=m2ZxDprKYlO", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5 + ], + "rating": "5.73", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "This paper presents a novel implicit process-based meta-learning (IPML) algorithm that, in contrast to existing works, explicitly represents each task as a continuous latent vector and models its probabilistic belief within the highly expressive IP framework. Unfortunately, meta-training in IPML is computationally challenging due to its need to perform intractable exact IP inference in task adaptation. To resolve this, we propose a novel expectation-maximization algorithm based on the stochastic gradient Hamiltonian Monte Carlo sampling method to perform meta-training. Our delicate design of the neural network architecture for meta-training in IPML allows competitive meta-learning performance to be achieved. Unlike existing works, IPML offers the benefits of being amenable to the characterization of a principled distance measure between tasks using the maximum mean discrepancy, active task selection without needing the assumption of known task contexts, and synthetic task generation by modeling task-dependent input distributions. Empirical evaluation on benchmark datasets shows that IPML outperforms existing Bayesian meta-learning algorithms. 
We have also empirically demonstrated on an e-commerce company's real-world dataset that IPML outperforms the baselines and identifies outliers tasks which can potentially degrade meta-testing performance.", + "title": "Meta-Learning with Implicit Processes", + "authors": [ + "YIZHOU CHEN", + "DONG LI", + "NA LI", + "TONG LIANG", + "SHIZHUO ZHANG", + "Bryan Kian Hsiang Low" + ], + "emails": [ + "e.ntu.edu.sg", + "~Bryan_Kian_Hsiang_Low1", + "~YIZHOU_CHEN1", + "alibaba-inc.com" + ], + "rank": 1088 + }, + { + "url": "https://openreview.net/forum?id=KJSC_AsN14", + "decision": "Reject", + "ratings": [ + 4, + 7, + 6, + 6 + ], + "rating": "5.73", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Representation learning has been greatly improved with the advance of contrastive learning methods with the performance being closer to their supervised learning counterparts. Those methods have greatly benefited from various data augmentations that are carefully designated to maintain their identities so that the images transformed from the same instance can still be retrieved. Although stronger augmentations could expose novel patterns of representations to improve their generalizability, directly using stronger augmentations in instance discrimination-based contrastive learning may even deteriorate the performance, because the distortions induced from the stronger augmentations could ridiculously change the image structures and thus the transformed images cannot be viewed as the same as the original ones any more. Additional efforts are needed for us to explore the role of the stronger augmentations in further pushing the performance of unsupervised learning to the fully supervised upper bound. 
Instead of applying the stronger augmentations directly to minimize the contrastive loss, we propose to minimize the distribution divergence between the weakly and strongly augmented images over the representation bank to supervise the retrieval of strongly augmented queries from a pool of candidates. This avoids an overoptimistic assumption that could overfit the strongly augmented queries containing distorted visual structures into the positive targets in the representation bank, while still being able to distinguish them from the negative samples by leveraging the distributions of weakly augmented counterparts. The proposed method achieves top-1 accuracy of 76.2% on ImageNet with a standard ResNet-50 architecture with a single-layer classifier fine-tuned. This is almost the same as 76.5% of top-1 accuracy with a fully supervised ResNet-50. Moreover, it outperforms the previous self-supervised and supervised methods on both the transfer learning and object detection tasks.\n", + "title": "Contrastive Learning with Stronger Augmentations", + "authors": [ + "Xiao Wang", + "Guo-Jun Qi" + ], + "emails": [ + "~Guo-Jun_Qi1", + "~Xiao_Wang6" + ], + "rank": 1089 + }, + { + "url": "https://openreview.net/forum?id=5FRJWsiLRmA", + "decision": "Reject", + "ratings": [ + 5, + 7, + 5 + ], + "rating": "5.73", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "We demonstrate that transformers obtain impressive performance even when some of the layers are randomly initialized and never updated. Inspired by old and well-established ideas in machine learning, we explore a variety of non-linear reservoir layers interspersed with regular transformer layers, and show improvements in wall-clock compute time until convergence, as well as overall performance, on various machine translation and (masked) language modelling tasks.", + "title": "Reservoir Transformers", + "authors": [ + "Sheng Shen", + "Alexei Baevski", + "Ari S. 
Morcos", + "Kurt Keutzer", + "Michael Auli", + "Douwe Kiela" + ], + "emails": [ + "~Michael_Auli1", + "~Kurt_Keutzer1", + "~Alexei_Baevski1", + "berkeley.edu", + "~Ari_S._Morcos1", + "~Douwe_Kiela1" + ], + "rank": 1090 + }, + { + "url": "https://openreview.net/forum?id=L7WD8ZdscQ5", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 6, + 6 + ], + "rating": "5.73", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "The adaptive stochastic gradient descent (SGD) with momentum has been widely adopted in deep learning as well as convex optimization. In practice, the last iterate is commonly used as the final solution. However, the available regret analysis and the setting of constant momentum parameters only guarantee the optimal convergence of the averaged solution. In this paper, we fill this theory-practice gap by investigating the convergence of the last iterate (referred to as {\\it individual convergence}), which is a more difficult task than convergence analysis of the averaged solution. Specifically, in the constrained convex cases, we prove that the adaptive Polyak's Heavy-ball (HB) method, in which the step size is only updated using the exponential moving average strategy, attains an individual convergence rate of O(1t), as opposed to that of O(log\u2061tt) of SGD, where t is the number of iterations. Our new analysis not only shows how the HB momentum and its time-varying weight help us to achieve the acceleration in convex optimization but also gives valuable hints how the momentum parameters should be scheduled in deep learning. 
Empirical results validate the correctness of our convergence analysis in optimizing convex functions and demonstrate the improved performance of the adaptive HB methods in training deep networks.", + "title": "The Role of Momentum Parameters in the Optimal Convergence of Adaptive Polyak's Heavy-ball Methods", + "authors": [ + "Wei Tao", + "Sheng Long", + "Gaowei Wu", + "Qing Tao" + ], + "emails": [ + "163.com", + "ia.ac.cn", + "~Wei_Tao3" + ], + "rank": 1091 + }, + { + "url": "https://openreview.net/forum?id=67q9f8gChCF", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 6 + ], + "rating": "5.73", + "confidences": [ + 3, + 3, + 2, + 3 + ], + "abstract": "Imitation learning from limited demonstrations is challenging. Most inverse reinforcement learning (IRL) methods are unable to perform as good as the demonstrator, especially in a high-dimensional environment, e.g, the Atari domain. To address this challenge, we propose a novel reward learning method, which streamlines a differential planning module with dynamics modeling. Our method learns useful planning computations with a meaningful reward function that focuses on the resulting region of an agent executing an action. Such a planning-based reward function leads to policies with better generalization ability. Empirical results with multiple network architectures and reward instances show that our method can outperform state-of-the-art IRL methods on multiple Atari games and continuous control tasks. Our method achieves performance that is averagely 1,139.1% of the demonstration. 
", + "title": "Learning Efficient Planning-based Rewards for Imitation Learning", + "authors": [ + "Xingrui Yu", + "Yueming Lyu", + "Ivor Tsang" + ], + "emails": [ + "~Ivor_Tsang1", + "~Yueming_Lyu1", + "~Xingrui_Yu1" + ], + "rank": 1092 + }, + { + "url": "https://openreview.net/forum?id=m08OHhXxl-5", + "decision": "Reject", + "ratings": [ + 6, + 5, + 7, + 5 + ], + "rating": "5.73", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Classifiers deployed in high-stakes applications must output calibrated confidence scores, i.e. their predicted probabilities should reflect empirical frequencies. Typically this is achieved with recalibration algorithms that adjust probability estimates based on the real-world data; however, existing algorithms are not applicable in real-world situations where the test data follows a different distribution from the training data, and privacy preservation is paramount (e.g. protecting patient records). We introduce a framework that provides abstractions for performing recalibration under differential privacy constraints. This framework allows us to adapt existing recalibration algorithms to satisfy differential privacy while remaining effective for domain-shift situations. Guided by our framework, we also design a novel recalibration algorithm, accuracy temperature scaling, that is tailored to the requirements of differential privacy. In an extensive empirical study, we find that our algorithm improves calibration on domain-shift benchmarks under the constraints of differential privacy. 
On the 15 highest severity perturbations of the ImageNet-C dataset, our method achieves a median ECE of 0.029, over 2x better than the next best recalibration method and almost 5x better than without recalibration.", + "title": "Privacy Preserving Recalibration under Domain Shift", + "authors": [ + "Rachel Luo", + "Shengjia Zhao", + "Jiaming Song", + "Jonathan Kuck", + "Stefano Ermon", + "Silvio Savarese" + ], + "emails": [ + "~Silvio_Savarese1", + "~Jiaming_Song1", + "~Rachel_Luo1", + "~Jonathan_Kuck1", + "~Stefano_Ermon1", + "~Shengjia_Zhao1" + ], + "rank": 1093 + }, + { + "url": "https://openreview.net/forum?id=eHG7asK_v-k", + "decision": "Reject", + "ratings": [ + 6, + 5, + 8, + 4 + ], + "rating": "5.73", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Trust-region methods are widely used in single-agent reinforcement learning. One advantage is that they guarantee a lower bound of monotonic payoff improvement for policy optimization at each iteration. Nonetheless, when applied in multi-agent settings, such guarantee is lost because an agent's payoff is also determined by other agents' adaptive behaviors. In fact, measuring agents' payoff improvements in multi-agent reinforcement learning (MARL) scenarios is still challenging. Although game-theoretical solution concepts such as Nash equilibrium can be applied, the algorithm (e.g., Nash-Q learning) suffers from poor scalability beyond two-player discrete games. To mitigate the above measurability and tractability issues, in this paper, we propose Multi-Agent Trust Region Learning (MATRL) method. MATRL augments the single-agent trust-region optimization process with the multi-agent solution concept of stable fixed point that is computed at the policy-space meta-game level. When multiple agents learn simultaneously, stable fixed points at the meta-game level can effectively measure agents' payoff improvements, and, importantly, a meta-game representation enjoys better scalability for multi-player games. 
We derive the lower bound of agents' payoff improvements for MATRL methods, and also prove the convergence of our method on the meta-game fixed points. We evaluate the MATRL method on both discrete and continuous multi-player general-sum games; results suggest that MATRL significantly outperforms strong MARL baselines on grid worlds, multi-agent MuJoCo, and Atari games. ", + "title": "Multi-Agent Trust Region Learning", + "authors": [ + "Ying Wen", + "Hui Chen", + "Yaodong Yang", + "Zheng Tian", + "Minne Li", + "Xu Chen", + "Jun Wang" + ], + "emails": [ + "~Ying_Wen1", + "ucl.ac.uk", + "~Minne_Li1", + "gmail.com", + "~Jun_Wang2", + "~Yaodong_Yang1" + ], + "rank": 1094 + }, + { + "url": "https://openreview.net/forum?id=3Wp8HM2CNdR", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 7 + ], + "rating": "5.73", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Most of the self-supervised representation learning methods are based on the contrastive loss and the instance-discrimination task, where augmented versions of the same image instance (\"positives\") are contrasted with instances extracted from other images (\"negatives\"). For the learning to be effective, a lot of negatives should be compared with a positive pair, which is computationally demanding. In this paper, we propose a different direction and a new loss function for self-supervised representation learning which is based on the whitening of the latent-space features. The whitening operation has a \"scattering\" effect on the batch samples, which compensates the use of negatives, avoiding degenerate solutions where all the sample representations collapse to a single point. Our Whitening MSE (W-MSE) loss does not require special heuristics (e.g. additional networks) and it is conceptually simple. Since negatives are not needed, we can extract multiple positive pairs from the same image instance. 
We empirically show that W-MSE is competitive with respect to popular, more complex self-supervised methods. The source code of the method and all the experiments is included in the Supplementary Material.", + "title": "Whitening for Self-Supervised Representation Learning", + "authors": [ + "Aleksandr Ermolov", + "Aliaksandr Siarohin", + "Enver Sangineto", + "Nicu Sebe" + ], + "emails": [ + "~Aliaksandr_Siarohin1", + "~Nicu_Sebe1", + "~Enver_Sangineto1", + "~Aleksandr_Ermolov1" + ], + "rank": 1095 + }, + { + "url": "https://openreview.net/forum?id=9Y7_c5ZAd5i", + "decision": "Reject", + "ratings": [ + 5, + 8, + 7, + 4 + ], + "rating": "5.73", + "confidences": [ + 4, + 4, + 2, + 5 + ], + "abstract": "Model-based algorithms---algorithms that explore the environment through building and utilizing an estimated model---are widely used in reinforcement learning practice and theoretically shown to achieve optimal sample efficiency for single-agent reinforcement learning in Markov Decision Processes (MDPs). However, for multi-agent reinforcement learning in Markov games, the current best known sample complexity for model-based algorithms is rather suboptimal and compares unfavorably against recent model-free approaches. In this paper, we present a sharp analysis of model-based self-play algorithms for multi-agent Markov games. We design an algorithm \\emph{Optimistic Nash Value Iteration} (Nash-VI) for two-player zero-sum Markov games that is able to output an \u03f5-approximate Nash policy in O~(H3SAB/\u03f52) episodes of game playing, where S is the number of states, A,B are the number of actions for the two players respectively, and H is the horizon length. This significantly improves over the best known model-based guarantee of O~(H4S2AB/\u03f52), and is the first that matches the information-theoretic lower bound \u03a9(H3S(A+B)/\u03f52) except for a min{A,B} factor. 
In addition, our guarantee compares favorably against the best known model-free algorithm if min{A,B}=o(H3), and outputs a single Markov policy while existing sample-efficient model-free algorithms output a nested mixture of Markov policies that is in general non-Markov and rather inconvenient to store and execute. We further adapt our analysis to designing a provably efficient task-agnostic algorithm for zero-sum Markov games, and designing the first line of provably sample-efficient algorithms for multi-player general-sum Markov games.", + "title": "A Sharp Analysis of Model-based Reinforcement Learning with Self-Play", + "authors": [ + "Qinghua Liu", + "Tiancheng Yu", + "Yu Bai", + "Chi Jin" + ], + "emails": [ + "~Yu_Bai1", + "~Tiancheng_Yu1", + "~Qinghua_Liu1", + "~Chi_Jin1" + ], + "rank": 1096 + }, + { + "url": "https://openreview.net/forum?id=Z3XVHSbSawb", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6 + ], + "rating": "5.73", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "Deep reinforcement learning algorithms have recently achieved significant success in learning high-performing policies from purely visual observations. The ability to perform end-to-end learning from raw high dimensional input alone has led to deep reinforcement learning algorithms being deployed in a variety of fields. Thus, understanding and improving the ability of deep reinforcement learning agents to generalize to unseen data distributions is of critical importance. Much recent work has focused on assessing the generalization of deep reinforcement learning agents by introducing specifically crafted adversarial perturbations to their inputs. In this paper, we propose another approach that we call daylight: a framework to assess the generalization skills of trained deep reinforcement learning agents. 
Rather than focusing on worst-case analysis of distribution shift, our approach is based on black-box perturbations that correspond to semantically meaningful changes to the environment or the agent's visual observation system ranging from brightness to compression artifacts. We demonstrate that even the smallest changes in the environment cause the performance of the agents to degrade significantly in various games from the Atari environment despite having orders of magnitude lower perceptual similarity distance compared to state-of-the-art adversarial attacks. We show that our framework captures a diverse set of bands in the Fourier spectrum, giving a better overall understanding of the agent's generalization capabilities. We believe our work can be crucial towards building resilient and generalizable deep reinforcement learning agents.", + "title": "Daylight: Assessing Generalization Skills of Deep Reinforcement Learning Agents", + "authors": [ + "Ezgi Korkmaz" + ], + "emails": [ + "~Ezgi_Korkmaz1" + ], + "rank": 1097 + }, + { + "url": "https://openreview.net/forum?id=qn_gk5j3PJ", + "decision": "Reject", + "ratings": [ + 6, + 7, + 4, + 6 + ], + "rating": "5.73", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Improving the robustness of neural nets in regression tasks is key to their application in multiple domains. Deep learning-based approaches aim to achieve this goal either by improving their prediction of specific values (i.e., point prediction), or by producing prediction intervals (PIs) that quantify uncertainty. We present PIVEN, a deep neural network for producing both a PI and a prediction of specific values. Unlike previous studies, PIVEN makes no assumptions regarding data distribution inside the PI, making its point prediction more effective for various real-world problems. 
Benchmark experiments show that our approach produces tighter uncertainty bounds than the current state-of-the-art approach for producing PIs, while maintaining comparable performance to the state-of-the-art approach for specific value-prediction. Additional evaluation on large image datasets further support our conclusions.", + "title": "PIVEN: A Deep Neural Network for Prediction Intervals with Specific Value Prediction", + "authors": [ + "Eli Simhayev", + "Gilad Katz", + "Lior Rokach" + ], + "emails": [ + "~Eli_Simhayev1", + "post.bgu.ac.il" + ], + "rank": 1098 + }, + { + "url": "https://openreview.net/forum?id=uFk038O5wZ", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 5 + ], + "rating": "5.73", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Recently, people have been paying more attention to the abstractive dialogue summarization task. Compared with news text, the information flows of the dialogue exchange between at least two interlocutors, which leads to the necessity of capturing long-distance cross-sentence relations. In addition, the generated summaries commonly suffer from fake facts because the key elements of dialogues often scatter in multiple utterances. However, the existing sequence-to-sequence models are difficult to address these issues. Therefore, it is necessary for researchers to explore the implicit conversational structure to ensure the richness and faithfulness of generated contents. In this paper, we present a Knowledge Graph Enhanced Dual-Copy network (KGEDC), a novel framework for abstractive dialogue summarization with conversational structure and factual knowledge. We use a sequence encoder to draw local features and a graph encoder to integrate global features via the sparse relational graph self-attention network, complementing each other. Besides, a dual-copy mechanism is also designed in decoding process to force the generation conditioned on both the source text and extracted factual knowledge. 
The experimental results show that our method produces significantly higher ROUGE scores than most of the baselines on both SAMSum corpus and Automobile Master corpus. Human judges further evaluate that outputs of our model contain more richer and faithful information.", + "title": "Improving Abstractive Dialogue Summarization with Conversational Structure and Factual Knowledge", + "authors": [ + "Lulu Zhao", + "Zeyuan Yang", + "Weiran Xu", + "Sheng Gao", + "Jun Guo" + ], + "emails": [ + "~Jun_Guo1", + "~Sheng_Gao1", + "bupt.edu.cn", + "~Lulu_Zhao1" + ], + "rank": 1099 + }, + { + "url": "https://openreview.net/forum?id=cuDFRRANJ-5", + "decision": "Reject", + "ratings": [ + 6, + 7, + 7, + 3 + ], + "rating": "5.73", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Studying the sensitivity of weight perturbation in neural networks and its impacts on model performance, including generalization and robustness, is an active research topic due to its implications on a wide range of machine learning tasks such as model compression, generalization gap assessment, and adversarial attacks. In this paper, we provide the first formal analysis for feed-forward neural networks with non-negative monotone activation functions against norm-bounded weight perturbations, in terms of the robustness in pairwise class margin functions and the Rademacher complexity for generalization. We further design a new theory-driven loss function for training generalizable and robust neural networks against weight perturbations. Empirical experiments are conducted to validate our theoretical analysis. 
Our results offer fundamental insights for characterizing the generalization and robustness of neural networks against weight perturbations.", + "title": "Formalizing Generalization and Robustness of Neural Networks to Weight Perturbations", + "authors": [ + "Yu-Lin Tsai", + "Chia-Yi Hsu", + "Chia-Mu Yu", + "Pin-Yu Chen" + ], + "emails": [ + "~Pin-Yu_Chen1", + "~Chia-Mu_Yu1", + "gmail.com" + ], + "rank": 1100 + }, + { + "url": "https://openreview.net/forum?id=bXLMnw03KPz", + "decision": "Reject", + "ratings": [ + 5, + 5, + 7 + ], + "rating": "5.73", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "Several notions of fairness, such as demographic parity and equal opportunity, are defined based on statistical independence between a predicted target and a sensitive attribute. In machine learning applications, however, the data distribution is unknown to the learner and statistical independence is not verifiable. Hence, the learner could only resort to empirical evaluation of the degree of fairness violation. Many fairness violation notions are defined as a divergence/distance between the joint distribution of the target and sensitive attributes and the Kronecker product of their marginals, such as \\Renyi correlation, mutual information, L\u221e distance, to name a few.\n In this paper, we propose another notion of fairness violation, called Exponential R\\'enyi Mutual Information (ERMI) between sensitive attributes and the predicted target. We show that ERMI is a strong fairness violation notion in the sense that it provides an upper bound guarantee on all of the aforementioned notions of fairness violation. We also propose the Fair Empirical Risk Minimization via ERMI regularization framework, called FERMI. Whereas existing in-processing fairness algorithms are deterministic, we provide a stochastic optimization method for solving FERMI that is amenable to large-scale problems. In addition, we provide a batch (deterministic) method to solve FERMI. 
Both of our proposed algorithms come with theoretical convergence guarantees. Our experiments show that FERMI achieves the most favorable tradeoffs between fairness violation and accuracy on test data across different problem setups, even when fairness violation is measured in notions other than ERMI. ", + "title": "FERMI: Fair Empirical Risk Minimization via Exponential R\u00e9nyi Mutual Information", + "authors": [ + "Rakesh Pavan", + "Andrew Lowy", + "Sina Baharlouei", + "Meisam Razaviyayn", + "Ahmad Beirami" + ], + "emails": [ + "~Ahmad_Beirami1", + "usc.edu", + "~Rakesh_Pavan1", + "~Meisam_Razaviyayn1", + "~Sina_Baharlouei1" + ], + "rank": 1101 + }, + { + "url": "https://openreview.net/forum?id=TiGF63rxr8Q", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 7 + ], + "rating": "5.73", + "confidences": [ + 3, + 2, + 2, + 4 + ], + "abstract": "One of the main challenges in real-world reinforcement learning is to learn successfully from limited training samples. We show that in certain settings, the available data can be dramatically increased through a form of multi-task learning, by exploiting an invariance property in the tasks. We provide a theoretical performance bound for the gain in sample efficiency under this setting. This motivates a new approach to multi-task learning, which involves the design of an appropriate neural network architecture and a prioritized task-sampling strategy. 
We demonstrate empirically the effectiveness of the proposed approach on two real-world sequential resource allocation tasks where this invariance property occurs: financial portfolio optimization and meta federated learning.", + "title": "Efficient Reinforcement Learning in Resource Allocation Problems Through Permutation Invariant Multi-task Learning", + "authors": [ + "Desmond Cai", + "Shiau Hong Lim", + "Laura Wynter" + ], + "emails": [ + "~Laura_Wynter1", + "gmail.com", + "~Shiau_Hong_Lim1" + ], + "rank": 1102 + }, + { + "url": "https://openreview.net/forum?id=wjJ3pR-ZQD", + "decision": "Reject", + "ratings": [ + 7, + 8, + 6, + 4, + 4 + ], + "rating": "5.73", + "confidences": [ + 4, + 3, + 1, + 3, + 4 + ], + "abstract": "Graph matching (GM) has been traditionally modeled as a deterministic optimization problem characterized by an affinity matrix under pre-defined graph topology. Though there have been several attempts on learning more effective node-level affinity/representation for matching, they still heavily rely on the initial graph structure/topology which is typically obtained through heuristic ways (e.g. Delauney or k-nearest) and will not be adjusted during the learning process to adapt to problem-specific patterns. We argue that a standalone graph representation learning is insufficient for GM task, whereby a GM solver may favor some latent topology other than pre-defined one. Motivated by this hypothesis, we propose to learn latent graph topology in replacement of the fixed topology as input. To this end, we devise two types of latent graph generation procedures in deterministic and generative fashion, respectively. Particularly, the generative procedure emphasizes the across-graph consistency and thus can be viewed as a \\textbf{co-generative} model. 
Our methods show superior performance over previous state-of-the-arts on several benchmarks, thus strongly supporting our hypothesis.", + "title": "Learning Latent Topology for Graph Matching", + "authors": [ + "Tianshu Yu", + "Runzhong Wang", + "Junchi Yan", + "Baoxin Li" + ], + "emails": [ + "~Baoxin_Li1", + "~Junchi_Yan2", + "~Runzhong_Wang1", + "~Tianshu_Yu2" + ], + "rank": 1103 + }, + { + "url": "https://openreview.net/forum?id=LjFGgI-_tT0", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6, + 6 + ], + "rating": "5.73", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Despite their theoretical appealingness, Bayesian neural networks (BNNs) are falling far behind in terms of adoption in real-world applications compared with normal NNs, mainly due to their limited scalability in training, and low fidelity in their uncertainty estimates. In this work, we develop a new framework, named BayesAdapter, to address these issues and bring Bayesian deep learning to the masses. The core notion of BayesAdapter is to adapt pre-trained deterministic NNs to be BNNs via Bayesian fine-tuning. We implement Bayesian fine-tuning with a plug-and-play instantiation of stochastic variational inference, and propose exemplar reparameterization to reduce gradient variance and stabilize the fine-tuning. Together, they enable training BNNs as if one were training deterministic NNs with minimal added overheads. During Bayesian fine-tuning, we further propose an uncertainty regularization to supervise and calibrate the uncertainty quantification of learned BNNs at low cost. To empirically evaluate BayesAdapter, we conduct extensive experiments on a diverse set of challenging benchmarks, and observe satisfactory training efficiency, competitive predictive performance, and calibrated and faithful uncertainty estimates. 
", + "title": "BayesAdapter: Being Bayesian, Inexpensively and Robustly, via Bayesian Fine-tuning", + "authors": [ + "Zhijie Deng", + "Xiao Yang", + "Hao Zhang", + "Yinpeng Dong", + "Jun Zhu" + ], + "emails": [ + "~Yinpeng_Dong2", + "~Jun_Zhu2", + "~Xiao_Yang4", + "~Zhijie_Deng1", + "~Hao_Zhang2" + ], + "rank": 1104 + }, + { + "url": "https://openreview.net/forum?id=w2Z2OwVNeK", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 3 + ], + "rating": "5.73", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "In high-dimensional state spaces, the usefulness of Reinforcement Learning (RL) is limited by the problem of exploration. This issue has been addressed using potential-based reward shaping (PB-RS) previously. In the present work, we introduce Final-Volume-Preserving Reward Shaping (FV-RS). FV-RS relaxes the strict optimality guarantees of PB-RS to a guarantee of preserved long-term behavior. Being less restrictive, FV-RS allows for reward shaping functions that are even better suited for improving the sample efficiency of RL algorithms. In particular, we consider settings in which the agent has access to an approximate plan. 
Here, we use examples of simulated robotic manipulation tasks to demonstrate that plan-based FV-RS can indeed significantly improve the sample efficiency of RL over plan-based PB-RS.", + "title": "Plan-Based Relaxed Reward Shaping for Goal-Directed Tasks", + "authors": [ + "Ingmar Schubert", + "Ozgur S Oguz", + "Marc Toussaint" + ], + "emails": [ + "~Ingmar_Schubert1", + "~Ozgur_S_Oguz1", + "~Marc_Toussaint1" + ], + "rank": 1105 + }, + { + "url": "https://openreview.net/forum?id=ZKyd0bkFmom", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 7 + ], + "rating": "5.73", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "One of the main challenges in current systems neuroscience is the analysis of high-dimensional neuronal and behavioral data that are characterized by different statistics and timescales of the recorded variables. We propose a parametric copula model which separates the statistics of the individual variables from their dependence structure, and escapes the curse of dimensionality by using vine copula constructions. We use a Bayesian framework with Gaussian Process (GP) priors over copula parameters, conditioned on a continuous task-related variable. We improve the flexibility of this method by 1) using non-parametric conditional (rather than unconditional) marginals; 2) linearly mixing copula elements with qualitatively different tail dependencies. We validate the model on synthetic data and compare its performance in estimating mutual information against the commonly used non-parametric algorithms. Our model provides accurate information estimates when the dependencies in the data match the parametric copulas used in our framework. Moreover, even when the exact density estimation with a parametric model is not possible, our Copula-GP model is still able to provide reasonable information estimates, close to the ground truth and comparable to those obtained with a neural network estimator. 
Finally, we apply our framework to real neuronal and behavioral recordings obtained in awake mice. We demonstrate the ability of our framework to 1) produce accurate and interpretable bivariate models for the analysis of inter-neuronal noise correlations or behavioral modulations; 2) expand to more than 100 dimensions and measure information content in the whole-population statistics. These results demonstrate that the Copula-GP framework is particularly useful for the analysis of complex multidimensional relationships between neuronal, sensory and behavioral data.", + "title": "Parametric Copula-GP model for analyzing multidimensional neuronal and behavioral relationships", + "authors": [ + "Nina Kudryashova", + "Theoklitos Amvrosiadis", + "Nathalie Dupuy", + "Nathalie Rochefort", + "Arno Onken" + ], + "emails": [ + "~Nina_Kudryashova1", + "ed.ac.uk", + "~Arno_Onken1" + ], + "rank": 1106 + }, + { + "url": "https://openreview.net/forum?id=lf7st0bJIA5", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 6, + 6 + ], + "rating": "5.73", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "We study the problem of unsupervised physical object discovery. While existing frameworks aim to decompose scenes into 2D segments based off each object's appearance, we explore how physics, especially object interactions, facilitates disentangling of 3D geometry and position of objects from video, in an unsupervised manner. Drawing inspiration from developmental psychology, our Physical Object Discovery Network (POD-Net) uses both multi-scale pixel cues and physical motion cues to accurately segment observable and partially occluded objects of varying sizes, and infer properties of those objects. Our model reliably segments objects on both synthetic and real scenes. The discovered object properties can also be used to reason about physical events.", + "title": "Unsupervised Discovery of 3D Physical Objects", + "authors": [ + "Yilun Du", + "Kevin A. 
Smith", + "Tomer Ullman", + "Joshua B. Tenenbaum", + "Jiajun Wu" + ], + "emails": [ + "~Jiajun_Wu1", + "~Kevin_A._Smith1", + "~Joshua_B._Tenenbaum1", + "~Tomer_Ullman1", + "~Yilun_Du1" + ], + "rank": 1107 + }, + { + "url": "https://openreview.net/forum?id=L4n9FPoQL1", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6 + ], + "rating": "5.73", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "The scarcity of class-labeled data is a ubiquitous bottleneck in a wide range of machine learning problems. While abundant unlabeled data normally exist and provide a potential solution, it is extremely challenging to exploit them. In this paper, we address this problem by leveraging Positive-Unlabeled~(PU) classification and conditional generation with extra unlabeled data \\emph{simultaneously}, both of which aim to make full use of agnostic unlabeled data to improve classification and generation performances. In particular, we present a novel training framework to jointly target both PU classification and conditional generation when exposing to extra data, especially out-of-distribution unlabeled data, by exploring the interplay between them: 1) enhancing the performance of PU classifiers with the assistance of a novel Conditional Generative Adversarial Network~(CGAN) that is robust to noisy labels, 2) leveraging extra data with predicted labels from a PU classifier to help the generation. Our key contribution is a Classifier-Noise-Invariant Conditional GAN~(CNI-CGAN) that can learn the clean data distribution from noisy labels predicted by a PU classifier. 
Theoretically, we proved the optimal condition of CNI-CGAN and experimentally, we conducted extensive evaluations on diverse datasets, verifying the simultaneous improvements on both classification and generation.", + "title": "Classify and Generate Reciprocally: Simultaneous Positive-Unlabelled Learning and Conditional Generation with Extra Data", + "authors": [ + "Bing Yu", + "Ke Sun", + "He Wang", + "Zhouchen Lin", + "Zhanxing Zhu" + ], + "emails": [ + "~Zhanxing_Zhu1", + "~Zhouchen_Lin1", + "~He_Wang6", + "~Ke_Sun3", + "~Bing_Yu1" + ], + "rank": 1108 + }, + { + "url": "https://openreview.net/forum?id=ygWoT6hOc28", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6, + 6 + ], + "rating": "5.73", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Prior Networks are a recently developed class of models which yield interpretable measures of uncertainty and have been shown to outperform state-of-the-art ensemble approaches on a range of tasks. They can also be used to distill an ensemble of models via \\emph{Ensemble Distribution Distillation} (EnD2), such that its accuracy, calibration and uncertainty estimates are retained within a single model. However, Prior Networks have so far been developed only for classification tasks. This work extends Prior Networks and EnD2 to regression tasks by considering the Normal-Wishart distribution. 
The properties of Regression Prior Networks are demonstrated on synthetic data, selected UCI datasets and a monocular depth estimation task, where they yield performance competitive with ensemble approaches.", + "title": "Regression Prior Networks", + "authors": [ + "Andrey Malinin", + "Sergey Chervontsev", + "Ivan Provilkov", + "Mark Gales" + ], + "emails": [ + "yandex-team.ru", + "~Andrey_Malinin1", + "~Mark_Gales1" + ], + "rank": 1109 + }, + { + "url": "https://openreview.net/forum?id=tv8n52XbO4p", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6, + 6 + ], + "rating": "5.73", + "confidences": [ + 5, + 4, + 5, + 1 + ], + "abstract": "Adversarial learning has emerged as one of the successful techniques to circumvent the susceptibility of existing methods against adversarial perturbations. However, the majority of existing defense methods are tailored to defend against a single category of adversarial perturbation (e.g. \u2113\u221e-attack). In safety-critical applications, this makes these methods extraneous as the attacker can adopt diverse adversaries to deceive the system. Moreover, training on multiple perturbations simultaneously significantly increases the computational overhead during training. To address these challenges, we propose a novel meta-learning framework that explicitly learns to generate noise to improve the model's robustness against multiple types of attacks. Its key component is Meta Noise Generator (MNG) that outputs optimal noise to stochastically perturb a given sample, such that it helps lower the error on diverse adversarial perturbations. By utilizing samples generated by MNG, we train a model by enforcing the label consistency across multiple perturbations. 
We validate the robustness of models trained by our scheme on various datasets and against a wide variety of perturbations, demonstrating that it significantly outperforms the baselines across multiple perturbations with a marginal computational cost.", + "title": "Learning to Generate Noise for Multi-Attack Robustness", + "authors": [ + "Divyam Madaan", + "Jinwoo Shin", + "Sung Ju Hwang" + ], + "emails": [ + "~Jinwoo_Shin1", + "~Divyam_Madaan1", + "~Sung_Ju_Hwang1" + ], + "rank": 1110 + }, + { + "url": "https://openreview.net/forum?id=Ew0zR07CYRd", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 5 + ], + "rating": "5.73", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Adversarial attacks against deep neural networks have been widely studied. Adversarial examples for deep reinforcement learning (DeepRL) have significant security implications, due to the deployment of these algorithms in many application domains. In this work we formalize an optimal myopic adversary for deep reinforcement learning agents. Our adversary attempts to find a bounded perturbation of the state which minimizes the value of the action taken by the agent. We show with experiments in various games in the Atari environment that our attack formulation achieves significantly larger impact as compared to the current state-of-the-art. 
Furthermore, this enables us to lower the bounds by several orders of magnitude on the perturbation needed to efficiently achieve significant impacts on DeepRL agents.", + "title": "Bounded Myopic Adversaries for Deep Reinforcement Learning Agents", + "authors": [ + "Ezgi Korkmaz", + "Henrik Sandberg", + "Gyorgy Dan" + ], + "emails": [ + "kth.se", + "~Ezgi_Korkmaz1" + ], + "rank": 1111 + }, + { + "url": "https://openreview.net/forum?id=HP-tcf48fT", + "decision": "Reject", + "ratings": [ + 7, + 5, + 5 + ], + "rating": "5.73", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "Detecting the Maximum Common Subgraph (MCS) between two input graphs is fundamental for applications in biomedical analysis, malware detection, cloud computing, etc. This is especially important in the task of drug design, where the successful extraction of common substructures in compounds can reduce the number of experiments needed to be conducted by humans. However, MCS computation is NP-hard, and state-of-the-art MCS solvers rely on heuristics in search which in practice cannot find good solution for large graph pairs under a limited search budget. Here we propose GLSearch, a Graph Neural Network based model for MCS detection, which learns to search. Our model uses a state-of-the-art branch and bound algorithm as the backbone search algorithm to extract subgraphs by selecting one node pair at a time. In order to make better node selection decision at each step, we replace the node selection heuristics with a novel task-specific Deep Q-Network (DQN), allowing the search process to find larger common subgraphs faster. To enhance the training of DQN, we leverage the search process to provide supervision in a pre-training stage and guide our agent during an imitation learning stage. Therefore, our framework allows search and reinforcement learning to mutually benefit each other. 
Experiments on synthetic and real-world large graph pairs demonstrate that our model outperforms state-of-the-art MCS solvers and neural graph matching network models.", + "title": "Learning to Search for Fast Maximum Common Subgraph Detection", + "authors": [ + "Yunsheng Bai", + "Derek Qiang Xu", + "Yizhou Sun", + "Wei Wang" + ], + "emails": [ + "~Wei_Wang13", + "~Derek_Qiang_Xu2", + "~Yizhou_Sun1", + "~Yunsheng_Bai1" + ], + "rank": 1112 + }, + { + "url": "https://openreview.net/forum?id=guetrIHLFGI", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 4, + 6, + 7 + ], + "rating": "5.73", + "confidences": [ + 3, + 3, + 4, + 5 + ], + "abstract": "We propose a new framework for reasoning about generalization in deep learning. \nThe core idea is to couple the Real World, where optimizers take stochastic gradient steps on the empirical loss, to an Ideal World, where optimizers take steps on the population loss. This leads to an alternate decomposition of test error into: (1) the Ideal World test error plus (2) the gap between the two worlds. If the gap (2) is universally small, this reduces the problem of generalization in offline learning to the problem of optimization in online learning.\nWe then give empirical evidence that this gap between worlds can be small in realistic deep learning settings, in particular supervised image classification. For example, CNNs generalize better than MLPs on image distributions in the Real World, but this is \"because\" they optimize faster on the population loss in the Ideal World. This suggests our framework is a useful tool for understanding generalization in deep learning, and lays the foundation for future research in this direction. 
", + "title": "The Deep Bootstrap Framework: Good Online Learners are Good Offline Generalizers", + "authors": [ + "Preetum Nakkiran", + "Behnam Neyshabur", + "Hanie Sedghi" + ], + "emails": [ + "~Hanie_Sedghi1", + "~Behnam_Neyshabur1", + "~Preetum_Nakkiran1" + ], + "rank": 1113 + }, + { + "url": "https://openreview.net/forum?id=Nc3TJqbcl3", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 5, + 6 + ], + "rating": "5.73", + "confidences": [ + 2, + 3, + 3, + 3 + ], + "abstract": "Solving high-dimensional, continuous robotic tasks is a challenging optimization problem. Model-based methods that rely on zero-order optimizers like the cross-entropy method (CEM) have so far shown strong performance and are considered state-of-the-art in the model-based reinforcement learning community. However, this success comes at the cost of high computational complexity, being therefore not suitable for real-time control. In this paper, we propose a technique to jointly optimize the trajectory and distill a policy, which is essential for fast execution in real robotic systems. Our method builds upon standard approaches, like guidance cost and dataset aggregation, and introduces a novel adaptive factor which prevents the optimizer from collapsing to the learner's behavior at the beginning of the training. 
The extracted policies reach unprecedented performance on challenging tasks as making a humanoid stand up and opening a door without reward shaping", + "title": "Extracting Strong Policies for Robotics Tasks from Zero-Order Trajectory Optimizers", + "authors": [ + "Cristina Pinneri", + "Shambhuraj Sawant", + "Sebastian Blaes", + "Georg Martius" + ], + "emails": [ + "~Cristina_Pinneri1", + "tuebingen.mpg.de", + "~Georg_Martius1" + ], + "rank": 1114 + }, + { + "url": "https://openreview.net/forum?id=-YCAwPdyPKw", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 6 + ], + "rating": "5.73", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Humans are capable of reasoning about physical phenomena by inferring laws of physics from a very limited set of observations. The inferred laws can potentially depend on unobserved properties, such as mass, texture, charge, etc. This sample-efficient physical reasoning is considered a core domain of human common-sense knowledge and hints at the existence of a physics engine in the head. In this paper, we propose a Bayesian symbolic framework for learning sample-efficient models of physical reasoning and prediction, which are of special interests in the field of intuitive physics. In our framework, the environment is represented by a top-down generative model with a collection of entities with some known and unknown properties as latent variables to capture uncertainty. The physics engine depends on physical laws which are modeled as interpretable symbolic expressions and are assumed to be functions of the latent properties of the entities interacting under simple Newtonian physics. As such, learning the laws is then reduced to symbolic regression and Bayesian inference methods are used to obtain the distribution of unobserved properties. 
These inference and regression steps are performed in an iterative manner following the expectation\u2013maximization algorithm to infer the unknown properties and use them to learn the laws from a very small set of observations. We demonstrate that on three physics learning tasks that compared to the existing methods of learning physics, our proposed framework is more data-efficient, accurate and makes joint reasoning and learning possible.", + "title": "A Bayesian-Symbolic Approach to Learning and Reasoning for Intuitive Physics", + "authors": [ + "Kai Xu", + "Akash Srivastava", + "Dan Gutfreund", + "Felix Sosa", + "Tomer Ullman", + "Joshua B. Tenenbaum", + "Charles Sutton" + ], + "emails": [ + "~Kai_Xu4", + "~Akash_Srivastava1", + "~Charles_Sutton1", + "fas.harvard.edu", + "~Dan_Gutfreund1", + "~Joshua_B._Tenenbaum1", + "~Tomer_Ullman1" + ], + "rank": 1115 + }, + { + "url": "https://openreview.net/forum?id=mLeIhe67Li6", + "decision": "Reject", + "ratings": [ + 6, + 6, + 7, + 4 + ], + "rating": "5.73", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "We analyze the learning problem of fully connected neural networks with the sigmoid activation function for binary classification in the teacher-student setup, where the outputs are assumed to be generated by a ground-truth teacher neural network with unknown parameters, and the learning objective is to estimate the teacher network model by minimizing a non-convex cross-entropy risk function of the training data over a student neural network. This paper analyzes a general and practical scenario that the input features follow a Gaussian mixture model of a finite number of Gaussian distributions of various mean and variance. We propose a gradient descent algorithm with a tensor initialization approach and show that our algorithm converges linearly to a critical point that has a diminishing distance to the ground-truth model with guaranteed generalizability. 
We characterize the required number of samples for successful convergence, referred to as the sample complexity, as a function of the parameters of the Gaussian mixture model. We prove analytically that when any mean or variance in the mixture model is large, or when all variances are close to zero, the sample complexity increases, and the convergence slows down, indicating a more challenging learning problem. Although focusing on one-hidden-layer neural networks, to the best of our knowledge, this paper provides the first explicit characterization of the impact of the parameters of the input distributions on the sample complexity and learning rate.", + "title": "Learning One-hidden-layer Neural Networks on Gaussian Mixture Models with Guaranteed Generalizability", + "authors": [ + "Hongkang Li", + "Shuai Zhang", + "Meng Wang" + ], + "emails": [ + "~Hongkang_Li1", + "~Shuai_Zhang6", + "~Meng_Wang4" + ], + "rank": 1116 + }, + { + "url": "https://openreview.net/forum?id=IW-EI6BCxy", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 5 + ], + "rating": "5.73", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Few-shot meta-learning methods consider the problem of learning new tasks from a small, fixed number of examples, by meta-learning across static data from a set of previous tasks. However, in many real world settings, it is more natural to view the problem as one of minimizing the total amount of supervision --- both the number of examples needed to learn a new task and the amount of data needed for meta-learning. Such a formulation can be studied in a sequential learning setting, where tasks are presented in sequence. When studying meta-learning in this online setting, a critical question arises: can meta-learning improve over the sample complexity and regret of standard empirical risk minimization methods, when considering both meta-training and adaptation together? 
The answer is particularly non-obvious for meta-learning algorithms with complex bi-level optimizations that may demand large amounts of meta-training data. To answer this question, we extend previous meta-learning algorithms to handle the variable-shot settings that naturally arise in sequential learning: from many-shot learning at the start, to zero-shot learning towards the end. On sequential learning problems, we find that meta-learning solves the full task set with fewer overall labels and achieves greater cumulative performance, compared to standard supervised methods. These results suggest that meta-learning is an important ingredient for building learning systems that continuously learn and improve over a sequence of problems.", + "title": "Variable-Shot Adaptation for Online Meta-Learning", + "authors": [ + "Tianhe Yu", + "Xinyang Geng", + "Chelsea Finn", + "Sergey Levine" + ], + "emails": [ + "~Xinyang_Geng1", + "~Chelsea_Finn1", + "~Sergey_Levine1", + "~Tianhe_Yu1" + ], + "rank": 1117 + }, + { + "url": "https://openreview.net/forum?id=1flmvXGGJaa", + "decision": "Reject", + "ratings": [ + 5, + 8, + 7, + 3 + ], + "rating": "5.72", + "confidences": [ + 4, + 5, + 4, + 5 + ], + "abstract": "The most significant barrier to the advancement of Neural Architecture Search (NAS) is its demand for large computational resources, which hinders scientifically sound empirical evaluations. As a remedy, several tabular NAS benchmarks were proposed to simulate runs of NAS methods in seconds. However, all existing tabular NAS benchmarks are limited to extremely small architectural spaces since they rely on exhaustive evaluations of the space. This leads to unrealistic results that do not transfer to larger search spaces. To overcome this fundamental limitation, we propose NAS-Bench-301, the first surrogate NAS benchmark, using a search space containing 1018 architectures, many orders of magnitude larger than any previous tabular NAS benchmark. 
After motivating the benefits of a surrogate benchmark over a tabular one, we fit various regression models on our dataset, which consists of \u223c60k architecture evaluations, and build surrogates via deep ensembles to model uncertainty. We benchmark a wide range of NAS algorithms using NAS-Bench-301 and obtain comparable results to the true benchmark at a fraction of the real cost. Finally, we show how NAS-Bench-301 can be used to generate new scientific insights.", + "title": "NAS-Bench-301 and the Case for Surrogate Benchmarks for Neural Architecture Search", + "authors": [ + "Julien Niklas Siems", + "Lucas Zimmer", + "Arber Zela", + "Jovita Lukasik", + "Margret Keuper", + "Frank Hutter" + ], + "emails": [ + "~Frank_Hutter1", + "~Arber_Zela1", + "~Lucas_Zimmer1", + "~Margret_Keuper1", + "~Julien_Niklas_Siems1", + "~Jovita_Lukasik1" + ], + "rank": 1118 + }, + { + "url": "https://openreview.net/forum?id=TYXs_y84xRj", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 8, + 3, + 6 + ], + "rating": "5.72", + "confidences": [ + 4, + 5, + 5, + 4 + ], + "abstract": "A variety of anchor-free object detectors have been actively proposed as possible alternatives to the mainstream anchor-based detectors that often rely on complicated design of anchor boxes. Despite achieving promising performance on par with anchor-based detectors, the existing anchor-free detectors such as FCOS or CenterNet predict objects based on standard Cartesian coordinates, which often yield poor quality keypoints. Further, the feature representation is also scale-sensitive. In this paper, we propose a new anchor-free keypoint based detector ``PolarNet\", where keypoints are represented as a set of Polar coordinates instead of Cartesian coordinates. The ``PolarNet\" detector learns offsets pointing to the corners of objects in order to learn high quality keypoints. Additionally, PolarNet uses features of corner points to localize objects, making the localization scale-insensitive. 
Finally in our experiments, we show that PolarNet, an anchor-free detector, outperforms the existing anchor-free detectors, and it is able to achieve highly competitive result on COCO test-dev benchmark (47.8% and 50.3% AP under the single-model single-scale and multi-scale testing) which is on par with the state-of-the-art two-stage anchor-based object detectors. The code and the models are available at https://github.com/XiongweiWu/PolarNetV1", + "title": "PolarNet: Learning to Optimize Polar Keypoints for Keypoint Based Object Detection", + "authors": [ + "Wu Xiongwei", + "Doyen Sahoo", + "Steven HOI" + ], + "emails": [ + "~Wu_Xiongwei1", + "~Steven_HOI1", + "~Doyen_Sahoo1" + ], + "rank": 1119 + }, + { + "url": "https://openreview.net/forum?id=0SPUQoRMAvc", + "decision": "Reject", + "ratings": [ + 5, + 7, + 6, + 5 + ], + "rating": "5.72", + "confidences": [ + 4, + 4, + 5, + 5 + ], + "abstract": "Self-supervised depth estimation has shown its great effectiveness in producing high quality depth maps given only image sequences as input. However, its performance usually drops when estimating on border areas or objects with thin structures due to the limited depth representation ability. In this paper, we address this problem by proposing a semantic-guided depth representation enhancement method, which promotes both local and global depth feature representations by leveraging rich contextual information. In stead of a single depth network as used in conventional paradigms, we propose an extra semantic segmentation branch to offer extra contextual features for depth estimation. Based on this framework, we enhance the local feature representation by sampling and feeding the point-based features that locate on the semantic edges to an individual Semantic-guided Edge Enhancement module (SEEM), which is specifically designed for promoting depth estimation on the challenging semantic borders. 
Then, we improve the global feature representation by proposing a semantic-guided multi-level attention mechanism, which enhances the semantic and depth features by exploring pixel-wise correlations in the multi-level depth decoding scheme. Extensive experiments validate the distinct superiority of our method in capturing highly accurate depth on the challenging image areas such as semantic category borders and thin objects. Both quantitative and qualitative experiments on KITTI show that our method outperforms the state-of-the-art methods.", + "title": "Semantic-Guided Representation Enhancement for Self-supervised Monocular Trained Depth Estimation", + "authors": [ + "Rui Li", + "Qing Mao", + "Pei Wang", + "Xiantuo He", + "Yu Zhu", + "Jinqiu Sun", + "Yanning Zhang" + ], + "emails": [ + "~Rui_Li10", + "nwpu.edu.cn", + "mail.nwpu.edu.cn", + "~Yanning_Zhang1", + "foxmail.com" + ], + "rank": 1120 + }, + { + "url": "https://openreview.net/forum?id=xxWl2oEvP2h", + "decision": "Reject", + "ratings": [ + 7, + 4, + 6, + 6 + ], + "rating": "5.72", + "confidences": [ + 5, + 5, + 5, + 3 + ], + "abstract": " The large-scale vehicle routing problems are defined based on the classical VRPs with thousands of customers. It is of great importance to find an efficient and high-quality solution for real-world applications. However, existing algorithms for VRPs including non-learning heuristics and RL-based methods, only perform well on small-scale instances with usually no more than a hundred customers. They are unable to solve large-scale VRPs due to either high computation cost or explosive solution space that results in model divergence. \nInspired by the classical idea of Divide-and-Conquer, we present a novel Rewriting-by-Generating(RBG) framework with hierarchical RL agents to solve large-scale VRPs. RBG consists of a rewriter agent that refines the customer division globally and an elementary generator to infer regional solutions locally. 
Extensive experiments demonstrate the effectiveness and efficiency of our proposed RBG framework. It outperforms LKH3, the state-of-the-art method for CVRPs, by 2.43% when customer number N=2000 and shortens the inference time by about 100 times.", + "title": "Rewriting by Generating: Learn Heuristics for Large-scale Vehicle Routing Problems", + "authors": [ + "Hansen Wang", + "Zefang Zong", + "Tong Xia", + "Shuyu Luo", + "Meng Zheng", + "Depeng Jin", + "Yong Li" + ], + "emails": [ + "~Tong_Xia1", + "~Yong_Li3", + "tsinghua.edu.cn", + "hitachi.cn", + "~Zefang_Zong1", + "~Hansen_Wang1" + ], + "rank": 1121 + }, + { + "url": "https://openreview.net/forum?id=a2rFihIU7i", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 5, + 6 + ], + "rating": "5.72", + "confidences": [ + 2, + 4, + 3, + 5, + 4 + ], + "abstract": "We introduce a model-based asynchronous multi-fidelity method for hyperparameter and neural architecture search that combines the strengths of asynchronous Successive Halving and Gaussian process-based Bayesian optimization. At the heart of our method is a probabilistic model that can simultaneously reason across hyperparameters and resource levels, and supports decision-making in the presence of pending evaluations. We demonstrate the effectiveness of our method on a wide range of challenging benchmarks, for tabular data, image classification and language modelling, and report substantial speed-ups over current state-of-the-art methods. 
Our new methods, along with asynchronous baselines, are implemented in a distributed framework which will be open sourced along with this publication.", + "title": "Model-based Asynchronous Hyperparameter and Neural Architecture Search", + "authors": [ + "Aaron Klein", + "Louis Chi-Chun Tiao", + "Thibaut Lienart", + "Cedric Archambeau", + "Matthias Seeger" + ], + "emails": [ + "~Matthias_Seeger2", + "~Louis_Chi-Chun_Tiao1", + "~Cedric_Archambeau1", + "~Thibaut_Lienart1", + "~Aaron_Klein1" + ], + "rank": 1122 + }, + { + "url": "https://openreview.net/forum?id=-WwaX9vKKt", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 6 + ], + "rating": "5.72", + "confidences": [ + 3, + 5, + 5, + 5 + ], + "abstract": "The task of visual question generation~(VQG) aims to generate human-like questions from an image and potentially other side information (e.g. answer type or the answer itself). Despite promising results have been achieved, previous works on VQG either i) suffer from one image to many questions mapping problem rendering the failure of generating referential and meaningful questions from an image, or ii) ignore rich correlations among the visual objects in an image and potential interactions between the side information and image. To address these limitations, we first propose a novel learning paradigm to generate visual questions with answer-awareness and region-reference. In particular, we aim to ask the right visual questions with \\emph{Double Hints - textual answers and visual regions of interests}, effectively mitigating the existing one-to-many mapping issue. To this end, we develop a simple methodology to self-learn the visual hints without introducing any additional human annotations. 
Furthermore, to capture these sophisticated relationships, we propose a new double-hints guided Graph-to-Sequence learning framework that first models them as a dynamic graph and learns the implicit topology end-to-end, and then utilize a graph-to-sequence model to generate the questions with double hints. Our experiments on VQA2.0 and COCO-QA datasets demonstrate that our proposed model on this new setting can significantly outperform existing state-of-the-art baselines by a large margin. ", + "title": "Ask Question with Double Hints: Visual Question Generation with Answer-awareness and Region-reference", + "authors": [ + "Shen Kai", + "Lingfei Wu", + "Siliang Tang", + "Fangli Xu", + "Zhu Zhang", + "Yu Qiang", + "Yueting Zhuang" + ], + "emails": [ + "~Yueting_Zhuang1", + "~Zhu_Zhang3", + "~Lingfei_Wu1", + "~Fangli_Xu2", + "~Yu_Qiang1", + "~Siliang_Tang1", + "~Shen_Kai1" + ], + "rank": 1123 + }, + { + "url": "https://openreview.net/forum?id=Ptaz_zIFbX", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 7, + 5, + 7, + 5 + ], + "rating": "5.71", + "confidences": [ + 4, + 4, + 4, + 4, + 1 + ], + "abstract": "Knowing how the effects of directed actions generalise to new situations (e.g. moving North, South, East and West, or turning left, right, etc.) is key to rapid generalisation across new situations. Markovian tasks can be characterised by a state space and a transition matrix and recent work has proposed that neural grid codes provide an efficient representation of the state space, as eigenvectors of a transition matrix reflecting diffusion across states, that allows efficient prediction of future state distributions. Here we extend the eigenbasis prediction model, utilising tools from Fourier analysis, to prediction over arbitrary translation-invariant directed transition structures (i.e. displacement and diffusion), showing that a single set of eigenvectors can support predictions over arbitrary directed actions via action-specific eigenvalues. 
We show how to define a \"sense of direction\" to combine actions to reach a target state (ignoring task-specific deviations from translation-invariance), and demonstrate that adding the Fourier representations to a deep Q network aids policy learning in continuous control tasks. We show the equivalence between the generalised prediction framework and traditional models of grid cell firing driven by self-motion to perform path integration, either using oscillatory interference (via Fourier components as velocity-controlled oscillators) or continuous attractor networks (via analysis of the update dynamics). We thus provide a unifying framework for the role of the grid system in predictive planning, sense of direction and path integration: supporting generalisable inference over directed actions across different tasks.", + "title": "Prediction and generalisation over directed actions by grid cells", + "authors": [ + "Changmin Yu", + "Timothy Behrens", + "Neil Burgess" + ], + "emails": [ + "fmrib.ox.ac.uk", + "ucl.ac.uk", + "~Changmin_Yu1" + ], + "rank": 1124 + }, + { + "url": "https://openreview.net/forum?id=dpuLRRQ7zC", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 5 + ], + "rating": "5.71", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "Randomized smoothing has achieved state-of-the-art certified robustness against l2-norm adversarial attacks. However, it is not wholly resolved on how to find the optimal base classifier for randomized smoothing. In this work, we employ a Smoothed WEighted ENsembling (SWEEN) scheme to improve the performance of randomized smoothed classifiers. We show the ensembling generality that SWEEN can help achieve optimal certified robustness. Furthermore, theoretical analysis proves that the optimal SWEEN model can be obtained from training under mild assumptions. We also develop an adaptive prediction algorithm to reduce\nthe prediction and certification cost of SWEEN models. 
Extensive experiments show that SWEEN models outperform the upper envelope of their corresponding candidate models by a large margin. Moreover, SWEEN models constructed using a few small models can achieve comparable performance to a single large model with a notable reduction in training time.", + "title": "Enhancing Certified Robustness of Smoothed Classifiers via Weighted Model Ensembling", + "authors": [ + "Chizhou Liu", + "Yunzhen Feng", + "Ranran Wang", + "Bin Dong" + ], + "emails": [ + "~Chizhou_Liu1", + "~Bin_Dong1", + "bibdr.org", + "~Yunzhen_Feng1" + ], + "rank": 1125 + }, + { + "url": "https://openreview.net/forum?id=0Hj3tFCSjUd", + "decision": "Reject", + "ratings": [ + 8, + 5, + 5, + 5 + ], + "rating": "5.71", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Retrosynthesis\u2014the process of identifying a set of reactants to synthesize a target molecule\u2014is of vital importance to material design and drug discovery. Existing machine learning approaches based on language models and graph neural networks have achieved encouraging results. However, the inner connections of these models are rarely discussed, and rigorous evaluations of these models are largely in need. In this paper, we propose a framework that unifies sequence- and graph-based methods as energy-based models (EBMs) with different energy functions. This unified point of view establishes connections between different models and identifies the differences between them, thereby promoting the understanding of model design. We also provide a comprehensive assessment of performance to the community. Moreover, we present a novel \u201cdual\u201d variant within the framework that performs consistent training over Bayesian forward- and backward-prediction by constraining the agreement between the two directions. 
This model improves the state of the art for template-free approaches where the reaction type is unknown and known.\n", + "title": "Energy-based View of Retrosynthesis", + "authors": [ + "Ruoxi Sun", + "Hanjun Dai", + "Li Li", + "Steven Kearnes", + "Bo Dai" + ], + "emails": [ + "~Bo_Dai1", + "~Li_Li8", + "~Ruoxi_Sun2", + "~Steven_Kearnes1", + "~Hanjun_Dai1" + ], + "rank": 1126 + }, + { + "url": "https://openreview.net/forum?id=_SKUm2AJpvN", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 7 + ], + "rating": "5.71", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "In an effort to overcome limitations of reward-driven feature learning in deep reinforcement learning (RL) from images, we propose decoupling representation learning from policy learning. To this end, we introduce a new unsupervised learning (UL) task, called Augmented Temporal Contrast (ATC), which trains a convolutional encoder to associate pairs of observations separated by a short time difference, under image augmentations and using a contrastive loss. In online RL experiments, we show that training the encoder exclusively using ATC matches or outperforms end-to-end RL in most environments. Additionally, we benchmark several leading UL algorithms by pre-training encoders on expert demonstrations and using them, with weights frozen, in RL agents; we find that agents using ATC-trained encoders outperform all others. We also train multi-task encoders on data from multiple environments and show generalization to different downstream RL tasks. Finally, we ablate components of ATC, and introduce a new data augmentation to enable replay of (compressed) latent images from pre-trained encoders when RL requires augmentation. 
Our experiments span visually diverse RL benchmarks in DeepMind Control, DeepMind Lab, and Atari, and our complete code is available at \\url{hidden url}.", + "title": "Decoupling Representation Learning from Reinforcement Learning", + "authors": [ + "Adam Stooke", + "Kimin Lee", + "Pieter Abbeel", + "Michael Laskin" + ], + "emails": [ + "~Kimin_Lee1", + "~Pieter_Abbeel2", + "berkeley.edu", + "~Adam_Stooke2" + ], + "rank": 1127 + }, + { + "url": "https://openreview.net/forum?id=AMoDLAx6GCC", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 7 + ], + "rating": "5.71", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Generating high quality uncertainty estimates for sequential regression, particularly deep recurrent networks, remains a challenging and open problem.\nExisting approaches often make restrictive assumptions (such as stationarity) yet still perform poorly in practice, particularly in presence of real world non-stationary signals and drift. \nThis paper describes a flexible method that can generate symmetric and asymmetric uncertainty estimates, makes no assumptions about stationarity, and outperforms competitive baselines on both drift and non drift scenarios.\nThis work helps make sequential regression more effective and practical for use in real-world applications, and is a powerful new addition to the modeling toolbox for sequential uncertainty quantification in general.", + "title": "Uncertainty Prediction for Deep Sequential Regression Using Meta Models", + "authors": [ + "Jiri Navratil", + "Matthew Arnold", + "Benjamin Elder" + ], + "emails": [ + "us.ibm.com", + "ibm.com", + "~Jiri_Navratil1" + ], + "rank": 1128 + }, + { + "url": "https://openreview.net/forum?id=zbEupOtJFF", + "decision": "Reject", + "ratings": [ + 7, + 6, + 5, + 5 + ], + "rating": "5.71", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "Invariance to a broad array of image corruptions, such as warping, noise, or color shifts, is an important aspect of 
building robust models in computer vision. Recently, several new data augmentations have been proposed that significantly improve performance on ImageNet-C, a benchmark of such corruptions. However, there is still a lack of basic understanding on the relationship between data augmentations and test-time corruptions. To this end, we develop a feature space for image transforms, and then use a new measure in this space between augmentations and corruptions called the Minimal Sample Distance to demonstrate there is a strong correlation between similarity and performance. We then investigate recent data augmentations and observe a significant degradation in corruption robustness when the test-time corruptions are sampled to be perceptually dissimilar from ImageNet-C in this feature space. Our results suggest that test error can be improved by training on perceptually similar augmentations, and data augmentations may risk overfitting to the existing benchmark. We hope our results and tools will allow for more robust progress towards improving robustness to image corruptions.", + "title": "On interaction between augmentations and corruptions in natural corruption robustness", + "authors": [ + "Eric Mintun", + "Alexander Kirillov", + "Saining Xie" + ], + "emails": [ + "~Alexander_Kirillov1", + "~Eric_Mintun1", + "~Saining_Xie2" + ], + "rank": 1129 + }, + { + "url": "https://openreview.net/forum?id=_HsKf3YaWpG", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6, + 6 + ], + "rating": "5.71", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Deep Neural Networks have shown great promise on a variety of downstream applications; but their ability to adapt and generalize to new data and tasks remains a challenging problem. However, the ability to perform few or zero-shot adaptation to novel tasks is important for the scalability and deployment of machine learning models. 
It is therefore crucial to understand what makes for good, transferable features in deep networks that best allow for such adaptation. In this paper, we shed light on this by showing that features that are most transferable have high uniformity in the embedding space and propose a uniformity regularization scheme that encourages better transfer and feature reuse. We evaluate the regularization on its ability to facilitate adaptation to unseen tasks and data, for which we conduct a thorough experimental study covering four relevant, and distinct domains: few-shot Meta-Learning, Deep Metric Learning, Zero-Shot Domain Adaptation, as well as Out-of-Distribution classification. Across all experiments, we show that uniformity regularization consistently offers benefits over baseline methods and is able to achieve state-of-the-art performance in Deep Metric Learning and Meta-Learning.", + "title": "Uniform Priors for Data-Efficient Transfer", + "authors": [ + "Samarth Sinha", + "Karsten Roth", + "Anirudh Goyal", + "Marzyeh Ghassemi", + "Hugo Larochelle", + "Animesh Garg" + ], + "emails": [ + "~Marzyeh_Ghassemi1", + "~Karsten_Roth1", + "~Hugo_Larochelle1", + "~Samarth_Sinha1", + "~Anirudh_Goyal1", + "~Animesh_Garg1" + ], + "rank": 1130 + }, + { + "url": "https://openreview.net/forum?id=PQ2Cel-1rJh", + "decision": "Reject", + "ratings": [ + 7, + 5, + 5, + 6 + ], + "rating": "5.71", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "How can we efficiently compress a model while maintaining its performance? Knowledge Distillation (KD) is one of the widely known methods for model compression. In essence, KD trains a smaller student model based on a larger teacher model and tries to retain the teacher model's level of performance as much as possible. However, the existing KD methods suffer from the following limitations. First, since the student model is small in absolute size, it inherently lacks model complexity. 
Second, the absence of an initial guide for the student model makes it difficult for the student to imitate the teacher model to its fullest. Conventional KD methods yield low performance due to these limitations.\n\nIn this paper, we propose Pea-KD (Parameter-efficient and accurate Knowledge Distillation), a novel approach to KD. Pea-KD consists of two main parts: Shuffled Parameter Sharing (SPS) and Pretraining with Teacher's Predictions (PTP). Using this combination, we are capable of alleviating the KD's limitations. SPS is a new parameter sharing method that allows greater model complexity for the student model. PTP is a KD-specialized initialization method, which can act as a good initial guide for the student. When combined, this method yields a significant increase in student model's performance. Experiments conducted on different datasets and tasks show that the proposed approach improves the student model's performance by 4.4% on average in four GLUE tasks, outperforming existing KD baselines by significant margins.\n", + "title": "Pea-KD: Parameter-efficient and accurate Knowledge Distillation", + "authors": [ + "IKHYUN CHO", + "U Kang" + ], + "emails": [ + "~IKHYUN_CHO1", + "~U_Kang1" + ], + "rank": 1131 + }, + { + "url": "https://openreview.net/forum?id=i80OPhOCVH2", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 8, + 5, + 5, + 6 + ], + "rating": "5.71", + "confidences": [ + 4, + 5, + 4, + 4, + 4 + ], + "abstract": "Since the proposal of the graph neural network (GNN) by Gori et al. (2005) and Scarselli et al. (2008), one of the major problems in training GNNs was their struggle to propagate information between distant nodes in the graph.\nWe propose a new explanation for this problem: GNNs are susceptible to a bottleneck when aggregating messages across a long path. 
This bottleneck causes the over-squashing of exponentially growing information into fixed-size vectors.\nAs a result, GNNs fail to propagate messages originating from distant nodes and perform poorly when the prediction task depends on long-range interaction.\nIn this paper, we highlight the inherent problem of over-squashing in GNNs:\nwe demonstrate that the bottleneck hinders popular GNNs from fitting long-range signals in the training data;\nwe further show that GNNs that absorb incoming edges equally, such as GCN and GIN, are more susceptible to over-squashing than GAT and GGNN;\nfinally, we show that prior work, which extensively tuned GNN models of long-range problems, suffers from over-squashing, and that breaking the bottleneck improves their state-of-the-art results without any tuning or additional weights. \nOur code is available at https://github.com/tech-srl/bottleneck/ .", + "title": "On the Bottleneck of Graph Neural Networks and its Practical Implications", + "authors": [ + "Uri Alon", + "Eran Yahav" + ], + "emails": [ + "~Uri_Alon1", + "~Eran_Yahav1" + ], + "rank": 1132 + }, + { + "url": "https://openreview.net/forum?id=Jr8XGtK04Pw", + "decision": "Reject", + "ratings": [ + 7, + 5, + 7, + 4 + ], + "rating": "5.71", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "Can neural networks learn goal-directed behaviour using similar strategies to the brain, by combining the relationships between the current state of the organism and the consequences of future actions? Recent work has shown that recurrent neural networks trained on goal based tasks can develop representations resembling those found in the brain, entorhinal cortex grid cells, for instance. Here we explore the evolution of the dynamics of their internal representations and compare this with experimental data. 
We observe that once a recurrent network is trained to learn the structure of its environment solely based on sensory prediction, an attractor based landscape forms in the network's representation, which parallels hippocampal place cells in structure and function. Next, we extend the predictive objective to include Q-learning for a reward task, where rewarding actions are dependent on delayed cue modulation. Mirroring experimental findings in hippocampus recordings in rodents performing the same task, this training paradigm causes nonlocal neural activity to sweep forward in space at decision points, anticipating the future path to a rewarded location. Moreover, prevalent choice and cue-selective neurons form in this network, again recapitulating experimental findings. Together, these results indicate that combining predictive, unsupervised learning of the structure of an environment with reinforcement learning can help understand the formation of hippocampus-like representations containing both spatial and task-relevant information.", + "title": "Hippocampal representations emerge when training recurrent neural networks on a memory dependent maze navigation task", + "authors": [ + "Justin Jude", + "Matthias Hennig" + ], + "emails": [ + "~Justin_Jude1", + "ed.ac.uk" + ], + "rank": 1133 + }, + { + "url": "https://openreview.net/forum?id=l-PrrQrK0QR", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 4 + ], + "rating": "5.71", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "One of the most fundamental aspects of any machine learning algorithm is the training data used by the algorithm. \nWe introduce the novel concept of \u03f5-approximation of datasets, obtaining datasets which are much smaller than or are significant corruptions of the original training data while maintaining similar performance. 
We introduce a meta-learning algorithm Kernel Inducing Points (KIP) for obtaining such remarkable datasets, drawing inspiration from recent developments in the correspondence between infinitely-wide neural networks and kernel ridge-regression (KRR). For KRR tasks, we demonstrate that KIP can compress datasets by one or two orders of magnitude, significantly improving previous dataset distillation and subset selection methods while obtaining state of the art results for MNIST and CIFAR10 classification. Furthermore, our KIP-learned datasets are transferable to the training of finite-width neural networks even beyond the lazy-training regime. Consequently, we obtain state of the art results for neural network dataset distillation with potential applications to privacy-preservation.", + "title": "Dataset Meta-Learning from Kernel Ridge-Regression", + "authors": [ + "Timothy Nguyen", + "Zhourong Chen", + "Jaehoon Lee" + ], + "emails": [ + "~Timothy_Nguyen1", + "~Jaehoon_Lee2", + "~Zhourong_Chen3" + ], + "rank": 1134 + }, + { + "url": "https://openreview.net/forum?id=-oeKiM9lD9h", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 6 + ], + "rating": "5.71", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "In this paper, we present our recent research about the computational efficiency in convolution. Convolution operation is the most critical component in recent surge of deep learning research. Conventional 2D convolution takes O(C2K2HW) to calculate, where C is the channel size, K is the kernel size, while H and W are the output height and width. Such computation has become really costly considering that these parameters increased over the past few years to meet the needs of demanding applications. Among various implementation of the convolution, separable convolution has been proven to be more efficient in reducing the computational demand. 
For example, depth separable convolution reduces the complexity to O(CHW\u22c5(C+K2)) while spatial separable convolution reduces the complexity to O(C2KHW). However, these are considered an ad hoc design which cannot ensure that they can in general achieve optimal separation. In this research, we propose a novel operator called \\emph{optimal separable convolution} which can be calculated at O(C32KHW) by optimal design for the internal number of groups and kernel sizes for general separable convolutions. When there is no restriction in the number of separated convolutions, an even lower complexity at O(CHW\u22c5log\u2061(CK2)) can be achieved. Experimental results demonstrate that the proposed optimal separable convolution is able to achieve an improved accuracy-FLOPs and accuracy-#Params trade-offs over both conventional and depth/spatial separable convolutions.", + "title": "Rethinking Convolution: Towards an Optimal Efficiency", + "authors": [ + "Tao Wei", + "Yonghong Tian", + "Chang Wen Chen" + ], + "emails": [ + "~Chang_Wen_Chen1", + "~Tao_Wei1", + "~Yonghong_Tian1" + ], + "rank": 1135 + }, + { + "url": "https://openreview.net/forum?id=jPSYH47QSZL", + "decision": "Reject", + "ratings": [ + 5, + 4, + 7, + 7 + ], + "rating": "5.71", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "There has recently been a flurry of exciting advances in deep learning models on point clouds. However, these advances have been hampered by the difficulty of creating labelled point cloud datasets: sparse point clouds often have unclear label identities for certain points, while dense point clouds are time-consuming to annotate. Inspired by mask-based pre-training in the natural language processing community, we propose a pre-training mechanism based point clouds completion. It works by masking occluded points that result from observations at different camera views. 
It then optimizes a completion model that learns how to reconstruct the occluded points, given the partial point cloud. In this way, our method learns a pre-trained representation that can identify the visual constraints inherently embedded in real-world point clouds. We call our method Occlusion Completion (OcCo). We demonstrate that OcCo learns representations that improve the semantic understandings as well as generalization on downstream tasks over prior methods, transfer to different datasets, reduce training time and improve label efficiency.", + "title": "Pre-Training by Completing Point Clouds", + "authors": [ + "Hanchen Wang", + "Qi Liu", + "Xiangyu Yue", + "Joan Lasenby", + "Matt Kusner" + ], + "emails": [ + "~Qi_Liu5", + "~Joan_Lasenby1", + "~Xiangyu_Yue1", + "~Hanchen_Wang1", + "~Matt_Kusner1" + ], + "rank": 1136 + }, + { + "url": "https://openreview.net/forum?id=Naqw7EHIfrv", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 6, + 5 + ], + "rating": "5.71", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "We propose Deep Autoencoding Predictive Components (DAPC) -- a self-supervised representation learning method for sequence data, based on the intuition that useful representations of sequence data should exhibit a simple structure in the latent space. We encourage this latent structure by maximizing an estimate of \\emph{predictive information} of latent feature sequences, which is the mutual information between the past and future windows at each time step. In contrast to the mutual information lower bound commonly used by contrastive learning, the estimate of predictive information we adopt is exact under a Gaussian assumption. Additionally, it can be computed without negative sampling. To reduce the degeneracy of the latent space extracted by powerful encoders and keep useful information from the inputs, we regularize predictive information learning with a challenging masked reconstruction loss. 
We demonstrate that our method recovers the latent space of noisy dynamical systems, extracts predictive features for forecasting tasks, and improves automatic speech recognition when used to pretrain the encoder on large amounts of unlabeled data.", + "title": "Representation Learning for Sequence Data with Deep Autoencoding Predictive Components", + "authors": [ + "Junwen Bai", + "Weiran Wang", + "Yingbo Zhou", + "Caiming Xiong" + ], + "emails": [ + "~Caiming_Xiong1", + "~Yingbo_Zhou1", + "~Junwen_Bai1", + "~Weiran_Wang1" + ], + "rank": 1137 + }, + { + "url": "https://openreview.net/forum?id=784_F-WCW46", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4, + 7, + 6 + ], + "rating": "5.71", + "confidences": [ + 3, + 5, + 4, + 5, + 4 + ], + "abstract": "In this paper, we examine the long-neglected yet important effects of point sam- pling patterns in point cloud GANs. Through extensive experiments, we show that sampling-insensitive discriminators (e.g. PointNet-Max) produce shape point clouds with point clustering artifacts while sampling-oversensitive discriminators (e.g. PointNet++, DGCNN, PointConv, KPConv) fail to guide valid shape generation. We propose the concept of sampling spectrum to depict the different sampling sensitivities of discriminators. We further study how different evaluation metrics weigh the sampling pattern against the geometry and propose several perceptual metrics forming a sampling spectrum of metrics. Guided by the proposed sampling spectrum, we discover a middle-point sampling-aware baseline discriminator, PointNet-Mix, which improves all existing point cloud generators by a large margin on sampling-related metrics. We point out that, given that recent research has been focused on the generator design, the discriminator design needs more attention. Our work provides both suggestions and tools for building future discriminators. 
We will release the code to facilitate future research.", + "title": "Rethinking Sampling in 3D Point Cloud Generative Adversarial Networks", + "authors": [ + "He Wang", + "Zetian Jiang", + "Li Yi", + "Kaichun Mo", + "Hao Su", + "Leonidas Guibas" + ], + "emails": [ + "~Li_Yi2", + "~Leonidas_Guibas1", + "~He_Wang5", + "~Kaichun_Mo1", + "~Zetian_Jiang1", + "~Hao_Su1" + ], + "rank": 1138 + }, + { + "url": "https://openreview.net/forum?id=wE-3ly4eT5G", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 5 + ], + "rating": "5.71", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "We propose a simple class of deep reinforcement learning (RL) methods, called FactoredRL, that can leverage factored environment structures to improve the sample efficiency of existing model-based and model-free RL algorithms. In tabular and linear approximation settings, the factored Markov decision process literature has shown exponential improvements in sample efficiency by leveraging factored environment structures. We extend this to deep RL algorithms that use neural networks. For model-based algorithms, we use the factored structure to inform the state transition network architecture and for model-free algorithms we use the factored structure to inform the Q network or the policy network architecture. 
We demonstrate that doing this significantly improves sample efficiency in both discrete and continuous state-action space settings.", + "title": "FactoredRL: Leveraging Factored Graphs for Deep Reinforcement Learning", + "authors": [ + "Bharathan Balaji", + "Petros Christodoulou", + "Xiaoyu Lu", + "Byungsoo Jeon", + "Jordan Bell-Masterson" + ], + "emails": [ + "cs.cmu.edu", + "~Bharathan_Balaji1", + "~Xiaoyu_Lu1", + "~Petros_Christodoulou1", + "~Jordan_Bell-Masterson1" + ], + "rank": 1139 + }, + { + "url": "https://openreview.net/forum?id=30I4Azqc_oP", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5 + ], + "rating": "5.70", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "Reinforcement Learning (RL) has shown great potential to deal with sequential decision-making problems. However, most RL algorithms do not explicitly consider the relations between entities in the environment. This makes the policy learning suffer from the problems of efficiency, effectivity and interpretability. In this paper, we propose a novel deep reinforcement learning algorithm, which firstly learns the causal structure of the environment and then leverages the learned causal information to assist policy learning. The proposed algorithm learns a graph to encode the environmental structure by calculating Average Causal Effect (ACE) between different categories of entities, and an intrinsic reward is given to encourage the agent to interact more with entities belonging to top-ranked categories, which significantly boosts policy learning. 
Several experiments are conducted on a number of simulation environments to demonstrate the effectiveness and better interpretability of our proposed method.", + "title": "Deep Reinforcement Learning with Causality-based Intrinsic Reward", + "authors": [ + "Peng Zhang", + "Furui Liu", + "Zhitang Chen", + "Jianye HAO", + "Jun Wang" + ], + "emails": [ + "~Peng_Zhang20", + "~Jianye_HAO1", + "huawei.com", + "~Jun_Wang2", + "~Zhitang_Chen1" + ], + "rank": 1140 + }, + { + "url": "https://openreview.net/forum?id=apiI1ySCSSR", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6 + ], + "rating": "5.70", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "Recent works found that fine-tuning and joint training---two popular approaches for transfer learning---do not always improve accuracy on downstream tasks. First, we aim to understand more about when and why fine-tuning and joint training can be suboptimal or even harmful for transfer learning. We design semi-synthetic datasets where the source task can be solved by either source-specific features or transferable features. We observe that (1) pre-training may not have incentive to learn transferable features and (2) joint training may simultaneously learn source-specific features and overfit to the target. Second, to improve over fine-tuning and joint training, we propose Meta Representation Learning MeRLin to learn transferable features. MeRLin meta-learns representations by ensuring that a head fit on top of the representations with target training data also performs well on target validation data. We also prove that MeRLin recovers the target ground-truth model with a quadratic neural net parameterization and a source distribution that contains both transferable and source-specific features. On the same distribution, pre-training and joint training provably fail to learn transferable features. 
MeRLin empirically outperforms previous state-of-the-art transfer learning algorithms on various real-world vision and NLP transfer learning benchmarks. ", + "title": "Meta-learning Transferable Representations with a Single Target Domain", + "authors": [ + "Hong Liu", + "Jeff Z. HaoChen", + "Colin Wei", + "Tengyu Ma" + ], + "emails": [ + "~Tengyu_Ma1", + "~Hong_Liu5", + "~Colin_Wei1", + "~Jeff_Z._HaoChen1" + ], + "rank": 1141 + }, + { + "url": "https://openreview.net/forum?id=ot9bYHvuULl", + "decision": "Reject", + "ratings": [ + 6, + 7, + 4 + ], + "rating": "5.70", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "While theoretically appealing, the application of the Wasserstein distance to large-scale machine learning problems has been hampered by its prohibitive computational cost. The sliced Wasserstein distance and its variants improve the computational efficiency through random projection, yet they suffer from low projection efficiency because the majority of projections result in trivially small values. In this work, we propose a new family of distance metrics, called augmented sliced Wasserstein distances (ASWDs), constructed by first mapping samples to higher-dimensional hypersurfaces parameterized by neural networks. It is derived from a key observation that (random) linear projections of samples residing on these hypersurfaces would translate to much more flexible nonlinear projections in the original sample space, so they can capture complex structures of the data distribution. We show that the hypersurfaces can be optimized by gradient ascent efficiently. We provide the condition under which the ASWD is a valid metric and show that this can be obtained by an injective neural network architecture. 
Numerical results demonstrate that the ASWD significantly outperforms other Wasserstein variants for both synthetic and real-world problems.", + "title": "Augmented Sliced Wasserstein Distances", + "authors": [ + "Xiongjie Chen", + "Yongxin Yang", + "Yunpeng Li" + ], + "emails": [ + "~Xiongjie_Chen1", + "~Yongxin_Yang1", + "~Yunpeng_Li1" + ], + "rank": 1142 + }, + { + "url": "https://openreview.net/forum?id=0PtUPB9z6qK", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 5, + 6 + ], + "rating": "5.70", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "We introduce the Generalized Energy Based Model (GEBM) for generative modelling. These models combine two trained components: a base distribution (generally an implicit model), which can learn the support of data with low intrinsic dimension in a high dimensional space; and an energy function, to refine the probability mass on the learned support. \nBoth the energy function and base jointly constitute the final model, unlike GANs, which retain only the base distribution (the \"generator\"). \nGEBMs are trained by alternating between learning the energy and the base. \nWe show that both training stages are well-defined: the energy is learned by maximising a generalized likelihood, and the resulting energy-based loss provides informative gradients for learning the base.\nSamples from the posterior on the latent space of the trained model can be obtained via MCMC, thus finding regions in this space that produce better quality samples.\nEmpirically, the GEBM samples on image-generation tasks are of much better quality than those from the learned generator alone, indicating that all else being equal, the GEBM will outperform a GAN of the same complexity. 
When using normalizing flows as base measures, GEBMs succeed on density modelling tasks returning comparable performance to direct maximum likelihood of the same networks.", + "title": "Generalized Energy Based Models", + "authors": [ + "Michael Arbel", + "Liang Zhou", + "Arthur Gretton" + ], + "emails": [ + "ucl.ac.uk", + "~Arthur_Gretton1", + "~Michael_Arbel1" + ], + "rank": 1143 + }, + { + "url": "https://openreview.net/forum?id=JdCUjf9xvlc", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 5 + ], + "rating": "5.70", + "confidences": [ + 3, + 1, + 3, + 3 + ], + "abstract": "Optimization of real-world black-box functions defined over purely categorical variables is an active area of research. In particular, optimization and design of biological sequences with specific functional or structural properties have a profound impact in medicine, materials science, and biotechnology. Standalone acquisition methods, such as simulated annealing (SA) and Monte Carlo tree search (MCTS), are typically used for such optimization problems. In order to improve the performance and sample efficiency of such acquisition methods, we propose to use existing acquisition methods in conjunction with a surrogate model for the black-box evaluations over purely categorical variables. To this end, we present two different representations, a group-theoretic Fourier expansion and an abridged one-hot encoded Boolean Fourier expansion. To learn such models, characters of each representation are considered as experts and their respective coefficients are updated via an exponential weight update rule each time the black box is evaluated. 
Numerical experiments over synthetic benchmarks as well as real-world RNA sequence optimization and design problems demonstrate the representational power of the proposed methods, which achieve competitive or superior performance compared to state-of-the-art counterparts, while improving the computational cost and/or sample efficiency substantially.", + "title": "Fourier Representations for Black-Box Optimization over Categorical Variables", + "authors": [ + "Hamid Dadkhahi", + "Jesus Rios", + "Karthikeyan Shanmugam", + "Payel Das" + ], + "emails": [ + "~Karthikeyan_Shanmugam1", + "~Payel_Das1", + "~Hamid_Dadkhahi1", + "us.ibm.com" + ], + "rank": 1144 + }, + { + "url": "https://openreview.net/forum?id=eyDDGPt5R1S", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6 + ], + "rating": "5.70", + "confidences": [ + 3, + 3, + 4 + ], + "abstract": "How can we perform posterior inference for deep latent variable models in an efficient and flexible manner? Markov chain Monte Carlo (MCMC) methods, such as Langevin dynamics, provide sample approximations of such posteriors with an asymptotic convergence guarantee. However, it is difficult to apply these methods to large-scale datasets owing to their slow convergence and datapoint-wise iterations. In this study, we propose amortized Langevin dynamics, wherein datapoint-wise MCMC iterations are replaced with updates of an inference model that maps observations into latent variables. The amortization enables scalable inference from large-scale datasets. Developing a latent variable model and an inference model with neural networks, yields Langevin autoencoders (LAEs), a novel Langevin-based framework for deep generative models. Moreover, if we define a latent prior distribution with an unnormalized energy function for more flexible generative modeling, LAEs are extended to a more general framework, which we refer to as contrastive Langevin autoencoders (CLAEs). 
We experimentally show that LAEs and CLAEs can generate sharp image samples. Moreover, we report their performance of unsupervised anomaly detection.", + "title": "Learning Deep Latent Variable Models via Amortized Langevin Dynamics", + "authors": [ + "Shohei Taniguchi", + "Yusuke Iwasawa", + "Yutaka Matsuo" + ], + "emails": [ + "~Yutaka_Matsuo1", + "~Yusuke_Iwasawa1", + "~Shohei_Taniguchi1" + ], + "rank": 1145 + }, + { + "url": "https://openreview.net/forum?id=Y3pk2JxYmO", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 6 + ], + "rating": "5.69", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "Recently, representations learned by self-supervised approaches have significantly reduced the gap with their supervised counterparts in many different computer vision tasks. However, these self-supervised methods are computationally challenging. In this work, we focus on accelerating contrastive learning algorithms with little or even no loss of accuracy. Our insight is that, contrastive learning concentrates on optimizing similarity (dissimilarity) between pairs of inputs, and the similarity on the intermediate layers is a good surrogate of the final similarity. We exploit our observation by introducing additional intermediate contrastive losses. In this way, we can truncate the back-propagation and updates only a part of the parameters for each gradient descent update. Additionally, we do selection based on the intermediate losses to filter easy regions for each image, which further reduces the computational cost. 
We apply our method to recently-proposed MOCO, SimCLR, SwAV and notice that we can reduce the computational cost with little loss on the performance of ImageNet linear classification and other downstream tasks.", + "title": "Fast Training of Contrastive Learning with Intermediate Contrastive Loss", + "authors": [ + "Chengyue Gong", + "Xingchao Liu", + "qiang liu" + ], + "emails": [ + "~qiang_liu4", + "~Xingchao_Liu1", + "~Chengyue_Gong1" + ], + "rank": 1146 + }, + { + "url": "https://openreview.net/forum?id=3EM0a2wC-jo", + "decision": "Reject", + "ratings": [ + 7, + 6, + 6, + 4 + ], + "rating": "5.69", + "confidences": [ + 4, + 3, + 2, + 4 + ], + "abstract": "When an agent interacts with a complex environment, it receives a stream of percepts in which it may detect entities, such as objects or people. To build up a coherent, low-variance estimate of the underlying state, it is necessary to fuse information from multiple detections over time. To do this fusion, the agent must decide which detections to associate with one another. We address this data-association problem in the setting of an online filter, in which each observation is processed by aggregating into an existing object hypothesis. Classic methods with strong probabilistic foundations exist, but they are computationally expensive and require models that can be difficult to acquire. In this work, we use the deep-learning tools of sparse attention and representation learning to learn a machine that processes a stream of detections and outputs a set of hypotheses about objects in the world. We evaluate this approach on simple clustering problems, problems with dynamics, and a complex image-based domain. We find that it generalizes well from short to long observation sequences and from a few to many hypotheses, outperforming other learning approaches and classical non-learning methods.", + "title": "Learning Online Data Association", + "authors": [ + "Yilun Du", + "Joshua B. 
Tenenbaum", + "Tomas Perez", + "Leslie Pack Kaelbling" + ], + "emails": [ + "~Tomas_Perez1", + "~Joshua_B._Tenenbaum1", + "~Leslie_Pack_Kaelbling1", + "~Yilun_Du1" + ], + "rank": 1147 + }, + { + "url": "https://openreview.net/forum?id=1EVb8XRBDNr", + "decision": "Reject", + "ratings": [ + 4, + 7, + 6, + 6 + ], + "rating": "5.69", + "confidences": [ + 4, + 4, + 3, + 2 + ], + "abstract": "Centralized training with decentralized execution (CTDE) has become an important paradigm in multi-agent reinforcement learning (MARL). Current CTDE-based methods rely on restrictive decompositions of the centralized value function across agents, which decomposes the global Q-value into individual Q values to guide individuals' behaviours. However, such expected, i.e., risk-neutral, Q value decomposition is not sufficient even with CTDE due to the randomness of rewards and the uncertainty in environments, which causes the failure of these methods to train coordinating agents in complex environments. To address these issues, we propose RMIX, a novel cooperative MARL method with the Conditional Value at Risk (CVaR) measure over the learned distributions of individuals' Q values. Our main contributions are in three folds: (i) We first learn the return distributions of individuals to analytically calculate CVaR for decentralized execution; (ii) We then propose a dynamic risk level predictor for CVaR calculation to handle the temporal nature of the stochastic outcomes during executions; (iii) We finally propose risk-sensitive Bellman equation along with Individual-Global-MAX (IGM) for MARL training. 
Empirically, we show that our method significantly outperforms state-of-the-art methods on many challenging StarCraft II tasks, demonstrating significantly enhanced coordination and high sample efficiency.", + "title": "RMIX: Risk-Sensitive Multi-Agent Reinforcement Learning", + "authors": [ + "Wei Qiu", + "Xinrun Wang", + "Runsheng Yu", + "Xu He", + "Rundong Wang", + "Bo An", + "Svetlana Obraztsova", + "Zinovi Rabinovich" + ], + "emails": [ + "~Rundong_Wang1", + "e.ntu.edu.sg", + "~Zinovi_Rabinovich1", + "~Wei_Qiu3", + "~Bo_An2", + "gmail.com", + "~Xinrun_Wang1", + "~Svetlana_Obraztsova1" + ], + "rank": 1148 + }, + { + "url": "https://openreview.net/forum?id=yOkSW62hqq2", + "decision": "Reject", + "ratings": [ + 5, + 7, + 6, + 5 + ], + "rating": "5.69", + "confidences": [ + 4, + 4, + 3, + 5 + ], + "abstract": "One effective way to ease the deployment of deep neural networks on resource constrained devices is Knowledge Distillation (KD), which boosts the accuracy of a low-capacity student model by mimicking the learnt information of a high-capacity teacher (either a single model or a multi-model ensemble). Although great progress has been attained on KD research, existing efforts are primarily invested to design better distillation losses by using soft logits or intermediate feature representations of the teacher as the extra supervision. In this paper, we present Explicit Connection Distillation (ECD), a new KD framework, which addresses the knowledge distillation problem in a novel perspective of bridging dense intermediate feature connections between a student network and its corresponding teacher generated automatically in the training, achieving knowledge transfer goal via direct cross-network layer-to-layer gradients propagation. ECD has two interdependent modules. 
In the first module, given a student network, an auxiliary teacher architecture is temporarily generated conditioned on strengthening feature representations of basic convolutions of the student network via replacing them with dynamic additive convolutions and keeping the other layers unchanged in structure. The teacher generated in this way guarantees its superior capacity and makes a perfect feature alignment (both in input and output dimensions) to the student at every convolutional layer. In the second module, dense feature connections between the aligned convolutional layers from the student to its auxiliary teacher are introduced, which allows explicit layer-to-layer gradients propagation from the teacher to the student via the merged model training from scratch. Intriguingly, as feature connection direction is one-way, all feature connections together with the auxiliary teacher merely exist during training phase. Experiments on popular image classification tasks validate the effectiveness of our method. Code will be made publicly available.", + "title": "Explicit Connection Distillation", + "authors": [ + "Lujun Li", + "Yikai Wang", + "Anbang Yao", + "Yi Qian", + "Xiao Zhou", + "Ke He" + ], + "emails": [ + "~Xiao_Zhou3", + "~Ke_He1", + "~Yi_Qian2", + "~Lujun_Li1", + "~Anbang_Yao1", + "~Yikai_Wang2" + ], + "rank": 1149 + }, + { + "url": "https://openreview.net/forum?id=nkap3LV7t7O", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6 + ], + "rating": "5.69", + "confidences": [ + 5, + 4, + 4 + ], + "abstract": "Variational autoencoders (VAEs) provide an effective and simple method for modeling complex distributions. However, training VAEs often requires considerable hyperparameter tuning to determine the optimal amount of information retained by the latent variable. We study the impact of calibrated decoders, which learn the uncertainty of the decoding distribution and can determine this amount of information automatically, on the VAE performance. 
While many methods for learning calibrated decoders have been proposed, many of the recent papers that employ VAEs rely on heuristic hyperparameters and ad-hoc modifications instead. We perform the first comprehensive comparative analysis of calibrated decoder and provide recommendations for simple and effective VAE training. Our analysis covers a range of datasets and several single-image and sequential VAE models. We further propose a simple but novel modification to the commonly used Gaussian decoder, which computes the prediction variance analytically. We observe empirically that using heuristic modifications is not necessary with our method.", + "title": "Simple and Effective VAE Training with Calibrated Decoders", + "authors": [ + "Oleh Rybkin", + "Kostas Daniilidis", + "Sergey Levine" + ], + "emails": [ + "~Oleh_Rybkin1", + "~Sergey_Levine1", + "~Kostas_Daniilidis1" + ], + "rank": 1150 + }, + { + "url": "https://openreview.net/forum?id=pHsHaXAv8m-", + "decision": "Reject", + "ratings": [ + 8, + 5, + 5, + 5 + ], + "rating": "5.69", + "confidences": [ + 3, + 3, + 4, + 3 + ], + "abstract": "Knowledge graph (KG) representation learning for entity alignment has recently received great attention. Compared with conventional methods, these embedding-based ones are considered to be robuster for highly-heterogeneous and cross-lingual entity alignment scenarios as they do not rely on the quality of machine translation or feature extraction. Despite the significant improvement that has been made, there is little understanding of how the embedding-based entity alignment methods actually work. Most existing methods rest on the foundation that a small number of pre-aligned entities can serve as anchors to connect the embedding spaces of two KGs. But no one investigates the rationality of such foundation. 
In this paper, we define a typical paradigm abstracted from the existing methods, and analyze how the representation discrepancy between two potentially-aligned entities is implicitly bounded by a predefined margin in the scoring function for embedding learning. However, such a margin cannot guarantee to be tight enough for alignment learning. We mitigate this problem by proposing a new approach that explicitly learns KG-invariant and principled entity representations, meanwhile preserves the original infrastructure of existing methods. In this sense, the model not only pursues the closeness of aligned entities on geometric distance, but also aligns the neural ontologies of two KGs to eliminate the discrepancy in feature distribution and underlying ontology knowledge. Our experiments demonstrate consistent and significant improvement in performance against the existing embedding-based entity alignment methods, including several state-of-the-art ones.", + "title": "Towards Principled Representation Learning for Entity Alignment", + "authors": [ + "Lingbing Guo", + "Zequn Sun", + "Mingyang Chen", + "Wei Hu", + "Huajun Chen" + ], + "emails": [ + "nju.edu.cn", + "~Huajun_Chen1", + "zju.edu.cn", + "~Lingbing_Guo1", + "~Zequn_Sun1" + ], + "rank": 1151 + }, + { + "url": "https://openreview.net/forum?id=InGI-IMDL18", + "decision": "Reject", + "ratings": [ + 8, + 2, + 6, + 7 + ], + "rating": "5.69", + "confidences": [ + 3, + 4, + 4, + 5 + ], + "abstract": "We consider the problem of training User Verification (UV) models in federated setup, where the conventional loss functions are not applicable due to the constraints that each user has access to the data of only one class and user embeddings cannot be shared with the server or other users. To address this problem, we propose Federated User Verification (FedUV), a framework for private and secure training of UV models. 
In FedUV, users jointly learn a set of vectors and maximize the correlation of their instance embeddings with a secret user-defined linear combination of those vectors. We show that choosing the linear combinations from the codewords of an error-correcting code allows users to collaboratively train the model without revealing their embedding vectors. We present the experimental results for user verification with voice, face, and handwriting data and show that FedUV is on par with existing approaches, while not sharing the embeddings with other users or the server.", + "title": "Secure Federated Learning of User Verification Models", + "authors": [ + "Hossein Hosseini", + "Hyunsin Park", + "Sungrack Yun", + "Christos Louizos", + "Joseph Soriaga", + "Max Welling" + ], + "emails": [ + "~Hossein_Hosseini4", + "~Sungrack_Yun1", + "qti.qualcomm.com", + "~Christos_Louizos1" + ], + "rank": 1152 + }, + { + "url": "https://openreview.net/forum?id=1s1T7xHc5l6", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 6 + ], + "rating": "5.69", + "confidences": [ + 4, + 3, + 5, + 4 + ], + "abstract": "Steerable CNN imposes the prior knowledge of transformation invariance or equivariance in the network architecture to enhance the the network robustness on geometry transformation of data and reduce overfitting. Filter transform has been an intuitive and widely used technique to construct steerable CNN in the past decades. Recently, group representation theory is used to analyze steerable CNN and reveals the function space structure of a steerable kernel function. However, it is not yet clear on how this theory is related to the filter transform technique. In this paper, we show that kernel constructed by filter transform can also be interpreted in the group representation theory. Meanwhile, we show that filter transformed kernels can be used to convolve input/output features in different group representation. 
This interpretation help complete the puzzle of steerable CNN theory and provides a novel and simple approach to implement steerable convolution operators. Experiments are executed on multiple datasets to verify the feasibilty of the proposed approach.\n", + "title": "FILTRA: Rethinking Steerable CNN by Filter Transform", + "authors": [ + "Bo Li", + "Qili Wang", + "Gim Hee Lee" + ], + "emails": [ + "~Qili_Wang1", + "~Gim_Hee_Lee1", + "~Bo_Li16" + ], + "rank": 1153 + }, + { + "url": "https://openreview.net/forum?id=yT7-k6Q6gda", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 6 + ], + "rating": "5.69", + "confidences": [ + 5, + 4, + 4, + 3 + ], + "abstract": "The early phase of training has been shown to be important in two ways for deep neural networks. First, the degree of regularization in this phase significantly impacts the final generalization. Second, it is accompanied by a rapid change in the local loss curvature influenced by regularization choices. Connecting these two findings, we show that stochastic gradient descent (SGD) implicitly penalizes the trace of the Fisher Information Matrix (FIM) from the beginning of training. We argue it is an implicit regularizer in SGD by showing that explicitly penalizing the trace of the FIM can significantly improve generalization. We further show that the early value of the trace of the FIM correlates strongly with the final generalization. We highlight that in the absence of implicit or explicit regularization, the trace of the FIM can increase to a large value early in training, to which we refer as catastrophic Fisher explosion. 
Finally, to gain insight into the regularization effect of penalizing the trace of the FIM, we show that it limits memorization by reducing the learning speed of examples with noisy labels more than that of the clean examples, and 2) trajectories with a low initial trace of the FIM end in flat minima, which are commonly associated with good generalization.", + "title": "Catastrophic Fisher Explosion: Early Phase Fisher Matrix Impacts Generalization", + "authors": [ + "Stanislaw Kamil Jastrzebski", + "Devansh Arpit", + "Oliver \u00c5strand", + "Giancarlo Kerg", + "Huan Wang", + "Caiming Xiong", + "richard socher", + "Kyunghyun Cho", + "Krzysztof J. Geras" + ], + "emails": [ + "~Devansh_Arpit2", + "~Oliver_\u00c5strand1", + "~Kyunghyun_Cho1", + "~Huan_Wang1", + "~Giancarlo_Kerg1", + "~Krzysztof_J._Geras1", + "~Stanislaw_Kamil_Jastrzebski1", + "~Caiming_Xiong1", + "~richard_socher1" + ], + "rank": 1154 + }, + { + "url": "https://openreview.net/forum?id=vcopnwZ7bC", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 4, + 6 + ], + "rating": "5.69", + "confidences": [ + 4, + 4, + 2, + 3 + ], + "abstract": "Many complex real-world tasks are composed of several levels of subtasks. Humans leverage these hierarchical structures to accelerate the learning process and achieve better generalization. In this work, we study the inductive bias and propose Ordered Memory Policy Network (OMPN) to discover subtask hierarchy by learning from demonstration. The discovered subtask hierarchy could be used to perform task decomposition, recovering the subtask boundaries in an unstructured demonstration. Experiments on Craft and Dial demonstrate that our model can achieve higher task decomposition performance under both unsupervised and weakly supervised settings, comparing with strong baselines. OMPN can also be directly applied to partially observable environments and still achieve higher task decomposition performance. 
Our visualization further confirms that the subtask hierarchy can emerge in our model 1.", + "title": "Learning Task Decomposition with Ordered Memory Policy Network", + "authors": [ + "Yuchen Lu", + "Yikang Shen", + "Siyuan Zhou", + "Aaron Courville", + "Joshua B. Tenenbaum", + "Chuang Gan" + ], + "emails": [ + "~Joshua_B._Tenenbaum1", + "~Yikang_Shen1", + "~Siyuan_Zhou2", + "~Chuang_Gan1", + "~Yuchen_Lu1", + "~Aaron_Courville3" + ], + "rank": 1155 + }, + { + "url": "https://openreview.net/forum?id=bW9SYKHcZiz", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6, + 6 + ], + "rating": "5.69", + "confidences": [ + 3, + 5, + 4, + 4 + ], + "abstract": "Inspired by the great success of BERT in NLP tasks, many text-vision BERT models emerged recently. Benefited from cross-modal attention, text-vision BERT models have achieved excellent performance in many language-vision tasks including text-image retrieval. Nevertheless, cross-modal attentions used in text-vision BERT models require too expensive computation cost when solving text-vision retrieval, which is impractical for large-scale search. In this work, we develop a novel architecture, Cross-Probe BERT. It relies on devised text and vision probes, and cross-modal attentions are conducted on text and vision probes. It takes lightweight computation cost, and meanwhile effectively exploits cross-modal attention. 
Systematic experiments conducted on two public benchmarks demonstrate state-of-the-art effectiveness and efficiency of the proposed method.", + "title": "Cross-Probe BERT for Efficient and Effective Cross-Modal Search", + "authors": [ + "TAN YU", + "Hongliang Fei", + "Ping Li" + ], + "emails": [ + "~TAN_YU2", + "~Ping_Li3", + "~Hongliang_Fei2" + ], + "rank": 1156 + }, + { + "url": "https://openreview.net/forum?id=FP9kKyNWwwE", + "decision": "Reject", + "ratings": [ + 4, + 6, + 6, + 7, + 6 + ], + "rating": "5.69", + "confidences": [ + 4, + 3, + 3, + 3, + 3 + ], + "abstract": "Zero-shot hyper-parameter optimization refers to the process of selecting hyper- parameter configurations that are expected to perform well for a given dataset upfront, without access to any observations of the losses of the target response. Existing zero-shot approaches are posed as initialization strategies for Bayesian Optimization and they often rely on engineered meta-features to measure dataset similarity, operating under the assumption that the responses of similar datasets behaves similarly with respect to the same hyper-parameters. Solutions for zero- shot HPO are embarrassingly parallelizable and thus can reduce vastly the required wallclock time of learning a single model. We propose a very simple HPO model called Gray-box Zero(0)-Shot Initialization (GROSI) as a conditional parametric surrogate that learns a universal response model by exploiting the relationship between the hyper-parameters and the dataset meta-features directly. In contrast to existing HPO solutions, we achieve transfer of knowledge without engineered meta- features, but rather through a shared model that is trained simultaneously across all datasets. We design and optimize a novel loss function that allows us to regress from the dataset/hyper-parameter pair unto the response. Experiments on 120 datasets demonstrate the strong performance of GROSI, compared to conventional initialization strategies. 
We also show that by fine-tuning GROSI to the target dataset, we can outperform state-of-the-art sequential HPO algorithms.", + "title": "Zero-shot Transfer Learning for Gray-box Hyper-parameter Optimization", + "authors": [ + "Hadi Samer Jomaa", + "Lars Schmidt-Thieme", + "Josif Grabocka" + ], + "emails": [ + "~Lars_Schmidt-Thieme1", + "~Josif_Grabocka1", + "~Hadi_Samer_Jomaa1" + ], + "rank": 1157 + }, + { + "url": "https://openreview.net/forum?id=NTP9OdaT6nm", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6, + 6 + ], + "rating": "5.69", + "confidences": [ + 5, + 5, + 3, + 3 + ], + "abstract": "In order to satisfy safety conditions, an agent may be constrained from acting freely. A safe controller can be designed a priori if an environment is well understood, but not when learning is employed. In particular, reinforcement learned (RL) controllers require exploration, which can be hazardous in safety critical situations. We study the benefits of giving structure to the constraints of a constrained Markov decision process by specifying them in formal languages as a step towards using safety methods from software engineering and controller synthesis. We instantiate these constraints as finite automata to efficiently recognise constraint violations. Constraint states are then used to augment the underlying MDP state and to learn a dense cost function, easing the problem of quickly learning joint MDP/constraint dynamics. 
We empirically evaluate the effect of these methods on training a variety of RL algorithms over several constraints specified in Safety Gym, MuJoCo, and Atari environments.", + "title": "Formal Language Constrained Markov Decision Processes", + "authors": [ + "Eleanor Quint", + "Dong Xu", + "Samuel W Flint", + "Stephen D Scott", + "Matthew Dwyer" + ], + "emails": [ + "~Matthew_Dwyer1", + "~Dong_Xu1", + "~Eleanor_Quint1", + "~Samuel_W_Flint1", + "~Stephen_D_Scott1" + ], + "rank": 1158 + }, + { + "url": "https://openreview.net/forum?id=ZHkbzSR56jA", + "decision": "Reject", + "ratings": [ + 7, + 6, + 5, + 5 + ], + "rating": "5.69", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Distributed learning has become a hot research topic due to its wide application in cluster-based large-scale learning, federated learning, edge computing and so on. Most traditional distributed learning methods typically assume no failure or attack on workers. However, many unexpected cases, such as communication failure and even malicious attack, may happen in real applications. Hence, Byzantine learning (BL), which refers to distributed learning with failure or attack, has recently attracted much attention. Most existing BL methods are synchronous, which are impractical in some applications due to heterogeneous or offline workers. In these cases, asynchronous BL (ABL) is usually preferred. In this paper, we propose a novel method, called buffered asynchronous stochastic gradient descent (BASGD), for ABL. To the best of our knowledge, BASGD is the first ABL method that can resist malicious attack without storing any instances on server. Compared with those methods which need to store instances on server, BASGD takes less risk of privacy leakage. BASGD is proved to be convergent, and be able to resist failure or attack. 
Empirical results show that BASGD significantly outperforms vanilla ASGD and other ABL baselines when there exists failure or attack on workers.", + "title": "BASGD: Buffered Asynchronous SGD for Byzantine Learning", + "authors": [ + "Yi-Rui Yang", + "Wu-Jun Li" + ], + "emails": [ + "~Wu-Jun_Li1", + "~Yi-Rui_Yang2" + ], + "rank": 1159 + }, + { + "url": "https://openreview.net/forum?id=5K8ZG9twKY", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 6 + ], + "rating": "5.69", + "confidences": [ + 4, + 4, + 5, + 3 + ], + "abstract": "A dramatic improvement in data collection technologies has aided in procuring massive amounts of unstructured and heterogeneous datasets. This has consequently led to a prevalence of heavy-tailed distributions across a broad range of tasks in machine learning. In this work, we perform thorough empirical studies to show that modern machine learning models such as generative adversarial networks and invertible flow models are plagued with such ill-behaved distributions during the phase of training them. To alleviate this problem, we develop a computationally-efficient estimator for mean estimation with provable guarantees which can handle such ill-behaved distributions. We provide specific consequences of our theory for supervised learning tasks such as linear regression and generalized linear models. 
Furthermore, we study the performance of our algorithm on synthetic tasks and real-world experiments and show that our methods convincingly outperform a variety of practical baselines.", + "title": "Efficient Estimators for Heavy-Tailed Machine Learning", + "authors": [ + "Vishwak Srinivasan", + "Adarsh Prasad", + "Sivaraman Balakrishnan", + "Pradeep Kumar Ravikumar" + ], + "emails": [ + "~Adarsh_Prasad1", + "~Sivaraman_Balakrishnan1", + "~Vishwak_Srinivasan1", + "~Pradeep_Kumar_Ravikumar1" + ], + "rank": 1160 + }, + { + "url": "https://openreview.net/forum?id=5L8XMh667qz", + "decision": "Reject", + "ratings": [ + 7, + 5, + 5 + ], + "rating": "5.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "While variational autoencoders have been successful in a variety of tasks, the use of conventional Gaussian or Gaussian mixture priors are limited in their ability to encode underlying structure of data in the latent representation.\nIn this work, we introduce an Encoded Prior Sliced Wasserstein AutoEncoder (EPSWAE) wherein an additional prior-encoder network facilitates learns an embedding of the data manifold which preserves topological and geometric properties of the data, thus improving the structure of latent space.\nThe autoencoder and prior-encoder networks are iteratively trained using the Sliced Wasserstein (SW) distance, which efficiently measures the distance between two \\textit{arbitrary} sampleable distributions without being constrained to a specific form as in the KL divergence, and without requiring expensive adversarial training.\nTo improve the representation, we use (1) a structural consistency term in the loss that encourages isometry between feature space and latent space and (2) a nonlinear variant of the SW distance which averages over random nonlinear shearing.\nThe effectiveness of the learned manifold encoding is best explored by traversing the latent space through interpolations along \\textit{geodesics} which generate samples that lie on 
the manifold and hence are advantageous compared to standard Euclidean interpolation.\nTo this end, we introduce a graph-based algorithm for interpolating along network-geodesics in latent space by maximizing the density of samples along the path while minimizing total energy. We use the 3D-spiral data to show that the prior does indeed encode the geometry underlying the data and to demonstrate the advantages of the network-algorithm for interpolation.\nAdditionally, we apply our framework to MNIST, and CelebA datasets, and show that outlier generations, latent representations, and geodesic interpolations are comparable to the state of the art.", + "title": "Encoded Prior Sliced Wasserstein AutoEncoder for learning latent manifold representations", + "authors": [ + "Sanjukta Krishnagopal", + "Jacob Bedrossian" + ], + "emails": [ + "~Sanjukta_Krishnagopal1", + "math.umd.edu" + ], + "rank": 1161 + }, + { + "url": "https://openreview.net/forum?id=8xLkv08d70T", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 4, + 6 + ], + "rating": "5.67", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "We introduce Adaptive Procedural Task Generation (APT-Gen), an approach to progressively generate a sequence of tasks as curricula to facilitate reinforcement learning in hard-exploration problems. At the heart of our approach, a task generator learns to create tasks from a parameterized task space via a black-box procedural generation module. To enable curriculum learning in the absence of a direct indicator of learning progress, we propose to train the task generator by balancing the agent's performance in the generated tasks and the similarity to the target tasks. Through adversarial training, the task similarity is adaptively estimated by a task discriminator defined on the agent's experiences, allowing the generated tasks to approximate target tasks of unknown parameterization or outside of the predefined task space. 
Our experiments on the grid world and robotic manipulation task domains show that APT-Gen achieves substantially better performance than various existing baselines by generating suitable tasks of rich variations.", + "title": "Adaptive Procedural Task Generation for Hard-Exploration Problems", + "authors": [ + "Kuan Fang", + "Yuke Zhu", + "Silvio Savarese", + "Fei-Fei Li" + ], + "emails": [ + "~Silvio_Savarese1", + "~Yuke_Zhu1", + "~Kuan_Fang3", + "~Fei-Fei_Li1" + ], + "rank": 1162 + }, + { + "url": "https://openreview.net/forum?id=ARFshOO1Iu", + "decision": "Reject", + "ratings": [ + 4, + 7, + 7 + ], + "rating": "5.67", + "confidences": [ + 4, + 2, + 3 + ], + "abstract": "Neural sequence labeling is an important technique employed for many Natural Language Processing (NLP) tasks, such as Named Entity Recognition (NER), slot tagging for dialog systems and semantic parsing. Large-scale pre-trained language models obtain very good performance on these tasks when fine-tuned on large amounts of task-specific labeled data. However, such large-scale labeled datasets are difficult to obtain for several tasks and domains due to the high cost of human annotation as well as privacy and data access constraints for sensitive user applications. This is exacerbated for sequence labeling tasks requiring such annotations at token-level. In this work, we develop techniques to address the label scarcity challenge for neural sequence labeling models. Specifically, we develop self-training and meta-learning techniques for training neural sequence taggers with few labels. While self-training serves as an effective mechanism to learn from large amounts of unlabeled data -- meta-learning helps in adaptive sample re-weighting to mitigate error propagation from noisy pseudo-labels. Extensive experiments on six benchmark datasets including two for massive multilingual NER and four slot tagging datasets for task-oriented dialog systems demonstrate the effectiveness of our method. 
With only 10 labeled examples for each class for each task, our method obtains 10% improvement over state-of-the-art systems demonstrating its effectiveness for the low-resource setting. ", + "title": "Adaptive Self-training for Neural Sequence Labeling with Few Labels", + "authors": [ + "Yaqing Wang", + "Subhabrata Mukherjee", + "Haoda Chu", + "Yuancheng Tu", + "Ming Wu", + "Jing Gao", + "Ahmed Hassan Awadallah" + ], + "emails": [ + "~Yaqing_Wang1", + "~Subhabrata_Mukherjee2", + "~Jing_Gao1", + "microsoft.com", + "~Ahmed_Hassan_Awadallah1" + ], + "rank": 1163 + }, + { + "url": "https://openreview.net/forum?id=dJbf5SqbFrM", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6 + ], + "rating": "5.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Transfer learning has been successfully applied across many high-impact applications. However, most existing work focuses on the static transfer learning setting, and very little is devoted to modeling the time evolving target domain, such as the online reviews for movies. To bridge this gap, in this paper, we focus on the continuous transfer learning setting with a time evolving target domain. One major challenge associated with continuous transfer learning is the time evolving relatedness of the source domain and the current target domain as the target domain evolves over time. To address this challenge, we first derive a generic generalization error bound on the current target domain with flexible domain discrepancy measures. Furthermore, a novel label-informed C-divergence is proposed to measure the shift of joint data distributions (over input features and output labels) across domains. It could be utilized to instantiate a tighter error upper bound in the continuous transfer learning setting, thus motivating us to develop an adversarial Variational Auto-encoder algorithm named CONTE by minimizing the C-divergence based error upper bound. 
Extensive experiments on various data sets demonstrate the effectiveness of our CONTE algorithm.\n", + "title": "Continuous Transfer Learning", + "authors": [ + "Jun Wu", + "Jingrui He" + ], + "emails": [ + "~Jun_Wu3", + "~Jingrui_He1" + ], + "rank": 1164 + }, + { + "url": "https://openreview.net/forum?id=2_Z6MECjPEa", + "decision": "Reject", + "ratings": [ + 5, + 7, + 7, + 3, + 7 + ], + "rating": "5.67", + "confidences": [ + 4, + 4, + 3, + 4, + 3 + ], + "abstract": "We introduce foveated perceptual systems -- a hybrid architecture inspired by human vision, to explore the role of a \\textit{texture-based} foveation stage on the nature and robustness of subsequently learned visual representation in machines. Specifically, these two-stage perceptual systems first foveate an image, inducing a texture-like encoding of peripheral information -- mimicking the effects of \\textit{visual crowding} -- which is then relayed through a convolutional neural network (CNN) trained to perform scene categorization. We find that these foveated perceptual systems learn a visual representation that is \\textit{distinct} from their non-foveated counterpart through experiments that probe: 1) i.i.d and o.o.d generalization; 2) robustness to occlusion; 3) a center image bias; and 4) high spatial frequency sensitivity. In addition, we examined the impact of this foveation transform with respect to two additional models derived with a rate-distortion optimization procedure to compute matched-resource systems: a lower resolution non-foveated system, and a foveated system with adaptive Gaussian blurring. The properties of greater i.i.d generalization, high spatial frequency sensitivity, and robustness to occlusion emerged exclusively in our foveated texture-based models, independent of network architecture and learning dynamics. 
Altogether, these results demonstrate that foveation -- via peripheral texture-based computations -- yields a distinct and robust representational format of scene information relative to standard machine vision approaches, and also provides symbiotic computational support that texture-based peripheral encoding has important representational consequences for processing in the human visual system.\n", + "title": "Emergent Properties of Foveated Perceptual Systems", + "authors": [ + "Arturo Deza", + "Talia Konkle" + ], + "emails": [ + "harvard.edu", + "~Arturo_Deza1" + ], + "rank": 1165 + }, + { + "url": "https://openreview.net/forum?id=L2LEB4vd9Qw", + "decision": "Reject", + "ratings": [ + 7, + 6, + 5, + 5 + ], + "rating": "5.67", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "We address the problem of scene layout generation for diverse domains such as images, mobile applications, documents and 3D objects. Most complex scenes, natural or human-designed, can be expressed as a meaningful arrangement of simpler compositional graphical primitives. Generating a new layout or extending an existing layout requires understanding the relationships between these primitives. To do this, we propose a multimodal attention framework, MMA, that leverages self-attention to learn contextual relationships between layout elements and generate novel layouts in a given domain. Our framework allows us to generate a new layout either from an empty set or from an initial seed set of primitives, and can easily scale to support an arbitrary of primitives per layout. Further, our analyses show that the model is able to automatically capture the semantic properties of the primitives. 
We propose simple improvements in both representation of layout primitives, as well as training methods to demonstrate competitive performance in very diverse data domains such as object bounding boxes in natural images (COCO bounding boxes), documents (PubLayNet), mobile applications (RICO dataset) as well as 3D shapes (PartNet).", + "title": "Multimodal Attention for Layout Synthesis in Diverse Domains", + "authors": [ + "Kamal Gupta", + "Vijay Mahadevan", + "Alessandro Achille", + "Justin Lazarow", + "Larry S. Davis", + "Abhinav Shrivastava" + ], + "emails": [ + "~Abhinav_Shrivastava2", + "~Justin_Lazarow1", + "~Kamal_Gupta1", + "~Alessandro_Achille1", + "~Larry_S._Davis1", + "~Vijay_Mahadevan1" + ], + "rank": 1166 + }, + { + "url": "https://openreview.net/forum?id=ZpS34ymonwE", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6 + ], + "rating": "5.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Recently demonstrated physical-world adversarial attacks have exposed vulnerabilities in perception systems that pose severe risks for safety-critical applications such as autonomous driving. These attacks place adversarial artifacts in the physical world that indirectly cause the addition of universal perturbations to inputs of a model that can fool it in a variety of contexts. Adversarial training is the most effective defense against image-dependent adversarial attacks. However, tailoring adversarial training to universal perturbations is computationally expensive since the optimal universal perturbations depend on the model weights which change during training. We propose meta adversarial training (MAT), a novel combination of adversarial training with meta-learning, which overcomes this challenge by meta-learning universal perturbations along with model training. MAT requires little extra computation while continuously adapting a large set of perturbations to the current model. 
We present results for universal patch and universal perturbation attacks on image classification and traffic-light detection. MAT considerably increases robustness against universal patch attacks compared to prior work. ", + "title": "Meta Adversarial Training", + "authors": [ + "Jan Hendrik Metzen", + "Nicole Finnie", + "Robin Hutmacher" + ], + "emails": [ + "~Jan_Hendrik_Metzen1", + "~Nicole_Finnie2", + "~Robin_Hutmacher1" + ], + "rank": 1167 + }, + { + "url": "https://openreview.net/forum?id=ZqB2GD-Ixn", + "decision": "Reject", + "ratings": [ + 3, + 9, + 5, + 7 + ], + "rating": "5.67", + "confidences": [ + 5, + 4, + 4, + 2 + ], + "abstract": "The ability to extrapolate, or generalize, from observed to new related environments is central to any form of reliable machine learning, yet most methods fail when moving beyond i.i.d data. In some cases, the reason lies in a misappreciation of the causal structure that governs the data, and in particular as a consequence of the influence of unobserved confounders that drive changes in observed distributions and distort correlations. In this paper, we argue for defining generalization with respect to a broader class of distribution shifts (defined as arising from interventions in the underlying causal model), including changes in observed, unobserved and target variable distributions. We propose a new robust learning principle that may be paired with any gradient-based learning algorithm. This learning principle has explicit generalization guarantees, and relates robustness with certain invariances in the causal model, clarifying why, in some cases, test performance lags training performance. 
We demonstrate the empirical performance of our approach on healthcare data from different modalities, including image and speech data.", + "title": "Accounting for Unobserved Confounding in Domain Generalization", + "authors": [ + "Alexis Bellot", + "Mihaela van der Schaar" + ], + "emails": [ + "~Alexis_Bellot1", + "~Mihaela_van_der_Schaar2" + ], + "rank": 1168 + }, + { + "url": "https://openreview.net/forum?id=GafvgJTFkgb", + "decision": "Reject", + "ratings": [ + 5, + 5, + 7 + ], + "rating": "5.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "The conversation around the fairness of machine learning models is growing and evolving. In this work, we focus on the issue of bias amplification: the tendency of models trained from data containing social biases to further amplify these biases. This problem is brought about by the algorithm, on top of the level of bias already present in the data. We make two main contributions regarding its measurement. First, building off of Zhao et al. (2017), we introduce and analyze a new, decoupled metric for measuring bias amplification, $\\text{BiasAmp}_{\\rightarrow}, which possesses a number of attractive properties, including the ability to pinpoint the cause of bias amplification. Second, we thoroughly analyze and discuss the normative implications of this metric. We provide suggestions about its measurement by cautioning against predicting sensitive attributes, encouraging the use of confidence intervals due to fluctuations in the fairness of models across runs, and discussing what bias amplification means in the context of domains where labels either don't exist at test time or correspond to uncertain future events. 
Throughout this paper, we work to provide a deeply interrogative look at the technical measurement of bias amplification, guided by our normative ideas of what we want it to encompass.", + "title": "A Technical and Normative Investigation of Social Bias Amplification", + "authors": [ + "Angelina Wang", + "Olga Russakovsky" + ], + "emails": [ + "~Olga_Russakovsky1", + "~Angelina_Wang1" + ], + "rank": 1169 + }, + { + "url": "https://openreview.net/forum?id=7TBP8k7TLFA", + "decision": "Reject", + "ratings": [ + 5, + 5, + 7 + ], + "rating": "5.67", + "confidences": [ + 3, + 3, + 3 + ], + "abstract": "Group symmetry is inherent in a wide variety of data distributions. Data processing that preserves symmetry is described as an equivariant map and often effective in achieving high performance. Convolutional neural networks (CNNs) have been known as models with equivariance and shown to approximate equivariant maps for some specific groups. However, universal approximation theorems for CNNs have been separately derived with individual techniques according to each group and setting. This paper provides a unified method to obtain universal approximation theorems for equivariant maps by CNNs in various settings. As its significant advantage, we can handle non-linear equivariant maps between infinite-dimensional spaces for non-compact groups.", + "title": "Universal Approximation Theorem for Equivariant Maps by Group CNNs", + "authors": [ + "Wataru Kumagai", + "Akiyoshi Sannai" + ], + "emails": [ + "~Wataru_Kumagai2", + "~Akiyoshi_Sannai1" + ], + "rank": 1170 + }, + { + "url": "https://openreview.net/forum?id=BtZhsSGNRNi", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 4, + 6 + ], + "rating": "5.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "The label shift problem refers to the supervised learning setting where the train and test label distributions do not match. Existing work addressing label shift usually assumes access to an unlabelled test sample. 
This sample may be used to estimate the test label distribution, and to then train a suitably re-weighted classifier. While approaches using this idea have proven effective, their scope is limited as it is not always feasible to access the target domain; further, they require repeated retraining if the model is to be deployed in multiple test environments. Can one instead learn a single classifier that is robust to arbitrary label shifts from a broad family? In this paper, we answer this question by proposing a model that minimises an objective based on distributionally robust optimisation (DRO). We then design and analyse a gradient descent-proximal mirror ascent algorithm tailored for large-scale problems to optimise the proposed objective. Finally, through experiments on CIFAR-100 and ImageNet, we show that our technique can significantly improve performance over a number of baselines in settings where label shift is present.", + "title": "Coping with Label Shift via Distributionally Robust Optimisation", + "authors": [ + "Jingzhao Zhang", + "Aditya Krishna Menon", + "Andreas Veit", + "Srinadh Bhojanapalli", + "Sanjiv Kumar", + "Suvrit Sra" + ], + "emails": [ + "~Suvrit_Sra1", + "~Jingzhao_Zhang2", + "~Andreas_Veit1", + "~Srinadh_Bhojanapalli1", + "~Sanjiv_Kumar1", + "~Aditya_Krishna_Menon1" + ], + "rank": 1171 + }, + { + "url": "https://openreview.net/forum?id=PdauS7wZBfC", + "decision": "Reject", + "ratings": [ + 7, + 6, + 6, + 4 + ], + "rating": "5.67", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "The backpropagation of error (backprop) is a powerful algorithm for training machine learning architectures through end-to-end differentiation. Recently it has been shown that backprop in multilayer-perceptrons (MLPs) can be approximated using predictive coding, a biologically-plausible process theory of cortical computation which relies solely on local and Hebbian updates. 
The power of backprop, however, lies not in its instantiation in MLPs, but rather in the concept of automatic differentiation which allows for the optimisation of any differentiable program expressed as a computation graph. Here, we demonstrate that predictive coding converges asymptotically (and in practice rapidly) to exact backprop gradients on arbitrary computation graphs using only local learning rules. We apply this result to develop a straightforward strategy to translate core machine learning architectures into their predictive coding equivalents. We construct predictive coding CNNs, RNNs, and the more complex LSTMs, which include a non-layer-like branching internal graph structure and multiplicative interactions. Our models perform equivalently to backprop on challenging machine learning benchmarks, while utilising only local and (mostly) Hebbian plasticity. Our method raises the potential that standard machine learning algorithms could in principle be directly implemented in neural circuitry, and may also contribute to the development of completely distributed neuromorphic architectures.", + "title": "Predictive Coding Approximates Backprop along Arbitrary Computation Graphs", + "authors": [ + "Beren Millidge", + "Alexander Tschantz", + "Christopher Buckley" + ], + "emails": [ + "~Christopher_Buckley1", + "~Alexander_Tschantz1", + "~Beren_Millidge1" + ], + "rank": 1172 + }, + { + "url": "https://openreview.net/forum?id=euDnVs0Ynts", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 4, + 5, + 7 + ], + "rating": "5.67", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "We study the problem of learning Bayesian networks where an \u03f5-fraction of the samples are adversarially corrupted. We focus on the fully-observable case where the underlying graph structure is known. In this work, we present the first nearly-linear time algorithm for this problem with a dimension-independent error guarantee. 
Previous robust algorithms with comparable error guarantees are slower by at least a factor of (d/\u03f5), where d is the number of variables in the Bayesian network and \u03f5 is the fraction of corrupted samples.\n\nOur algorithm and analysis are considerably simpler than those in previous work. We achieve this by establishing a direct connection between robust learning of Bayesian networks and robust mean estimation. As a subroutine in our algorithm, we develop a robust mean estimation algorithm whose runtime is nearly-linear in the number of nonzeros in the input samples, which may be of independent interest.", + "title": "Robust Learning of Fixed-Structure Bayesian Networks in Nearly-Linear Time", + "authors": [ + "Yu Cheng", + "Honghao Lin" + ], + "emails": [ + "~Yu_Cheng2", + "~Honghao_Lin1" + ], + "rank": 1173 + }, + { + "url": "https://openreview.net/forum?id=2CjEVW-RGOJ", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 5, + 6, + 6 + ], + "rating": "5.67", + "confidences": [ + 3, + 5, + 2, + 5 + ], + "abstract": "We introduce Skip-Window, a method to allow recurrent neural networks (RNNs) to trade off accuracy for computational cost during the analysis of a sequence. Similarly to existing approaches, Skip-Window extends existing RNN cells by adding a mechanism to encourage the model to process fewer inputs. Unlike existing approaches, Skip-Window is able to respect a strict computational budget, making this model more suitable for limited hardware. We evaluate this approach on two datasets: a human activity recognition task and adding task. 
Our results show that Skip-Window is able to exceed the accuracy of existing approaches for a lower computational cost while strictly limiting said cost.", + "title": "SkipW: Resource Adaptable RNN with Strict Upper Computational Limit", + "authors": [ + "Tsiry Mayet", + "Anne Lambert", + "Pascal Leguyadec", + "Francoise Le Bolzer", + "Fran\u00e7ois Schnitzler" + ], + "emails": [ + "interdigital.com", + "gmail.com", + "~Fran\u00e7ois_Schnitzler1" + ], + "rank": 1174 + }, + { + "url": "https://openreview.net/forum?id=K9bw7vqp_s", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 5, + 6 + ], + "rating": "5.67", + "confidences": [ + 4, + 3, + 5, + 3 + ], + "abstract": "Sparsity in Deep Neural Networks (DNNs) has been widely studied to compress and accelerate the models on resource-constrained environments. It can be generally categorized into unstructured fine-grained sparsity that zeroes out multiple individual weights distributed across the neural network, and structured coarse-grained sparsity which prunes blocks of sub-networks of a neural network. Fine-grained sparsity can achieve a high compression ratio but is not hardware friendly and hence receives limited speed gains. On the other hand, coarse-grained sparsity cannot simultaneously achieve both apparent acceleration on modern GPUs and\ndecent performance. In this paper, we are the first to study training from scratch an N:M fine-grained structured sparse network, which can maintain the advantages of both unstructured fine-grained sparsity and structured coarse-grained sparsity simultaneously on specifically designed GPUs. Specifically, a 2 : 4 sparse network could achieve 2\u00d7 speed-up without performance drop on Nvidia A100 GPUs. Furthermore, we propose a novel and effective ingredient, sparse-refined straight-through estimator (SR-STE), to alleviate the negative influence of the approximated gradients computed by vanilla STE during optimization. 
We also define a metric, Sparse Architecture Divergence (SAD), to measure the sparse network\u2019s topology change during the training process. Finally, We justify SR-STE\u2019s advantages with SAD and demonstrate the effectiveness of SR-STE by performing\ncomprehensive experiments on various tasks. Anonymous code and model will be at available at https://github.com/anonymous-NM-sparsity/NM-sparsity.", + "title": "Learning N:M Fine-grained Structured Sparse Neural Networks From Scratch", + "authors": [ + "Aojun Zhou", + "Yukun Ma", + "Junnan Zhu", + "Jianbo Liu", + "Zhijie Zhang", + "Kun Yuan", + "Wenxiu Sun", + "Hongsheng Li" + ], + "emails": [ + "~Jianbo_Liu3", + "~Kun_Yuan1", + "~Yukun_Ma2", + "~Zhijie_Zhang1", + "~Wenxiu_Sun1", + "~Hongsheng_Li3", + "nlpr.ia.ac.cn", + "~Aojun_Zhou2" + ], + "rank": 1175 + }, + { + "url": "https://openreview.net/forum?id=ASAJvUPWaDI", + "decision": "Reject", + "ratings": [ + 7, + 6, + 4 + ], + "rating": "5.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "We present an efficient and scalable algorithm for debiasing trained models, including deep neural networks (DNNs), which we prove to be near-optimal by bounding its excess Bayes risk. Unlike previous black-box reduction methods to cost-sensitive classification rules, the proposed algorithm operates on models that have been trained without having to retrain the model. Furthermore, as the algorithm is based on projected stochastic gradient descent (SGD), it is particularly attractive for deep learning applications. 
We empirically validate the proposed algorithm on standard benchmark datasets across both classical algorithms and modern DNN architectures and demonstrate that it outperforms previous post-processing approaches for unbiased classification.", + "title": "A Near-Optimal Recipe for Debiasing Trained Machine Learning Models", + "authors": [ + "Ibrahim Alabdulmohsin", + "Mario Lucic" + ], + "emails": [ + "~Ibrahim_Alabdulmohsin1", + "~Mario_Lucic1" + ], + "rank": 1176 + }, + { + "url": "https://openreview.net/forum?id=kvhzKz-_DMF", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 8, + 4, + 4 + ], + "rating": "5.67", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "While large-scale pretrained language models have obtained impressive results when fine-tuned on a wide variety of tasks, they still often suffer from overfitting in low-resource scenarios. Since such models are general-purpose feature extractors, many of these features are inevitably irrelevant for a given target task. We propose to use Variational Information Bottleneck (VIB) to suppress irrelevant features when fine-tuning on low-resource target tasks, and show that our method successfully reduces overfitting. Moreover, we show that our VIB model finds sentence representations that are more robust to biases in natural language inference datasets, and thereby obtains better generalization to out-of-domain datasets. Evaluation on seven low-resource datasets in different tasks shows that our method significantly improves transfer learning in low-resource scenarios, surpassing prior work. Moreover, it improves generalization on 13 out of 15 out-of-domain natural language inference benchmarks. 
Our code is publicly available in https://github.com/rabeehk/vibert.", + "title": "Variational Information Bottleneck for Effective Low-Resource Fine-Tuning", + "authors": [ + "Rabeeh Karimi mahabadi", + "Yonatan Belinkov", + "James Henderson" + ], + "emails": [ + "~Rabeeh_Karimi_mahabadi2", + "~Yonatan_Belinkov1", + "~James_Henderson1" + ], + "rank": 1177 + }, + { + "url": "https://openreview.net/forum?id=ueiBFzt7CiK", + "decision": "Reject", + "ratings": [ + 6, + 4, + 7 + ], + "rating": "5.67", + "confidences": [ + 3, + 3, + 3 + ], + "abstract": "Recently there is a surge of interests in using graph neural networks (GNNs) to learn algorithms. However, these works focus more on imitating existing algorithms, and are limited in two important aspects: the search space for algorithms is too small and the learned GNN models are not interpretable. To address these issues, we propose a novel framework which enlarge the search space using cheap global information from tree decomposition of the graphs, and can explain the structures of the graph leading to the decision of learned algorithms. We apply our framework to three NP-complete problems on graphs and show that the framework is able to discover effective and explainable algorithms. ", + "title": "A Framework For Differentiable Discovery Of Graph Algorithms", + "authors": [ + "Hanjun Dai", + "Xinshi Chen", + "Yu Li", + "Xin Gao", + "Le Song" + ], + "emails": [ + "~Xin_Gao1", + "~Yu_Li1", + "~Xinshi_Chen1", + "~Le_Song1", + "~Hanjun_Dai1" + ], + "rank": 1178 + }, + { + "url": "https://openreview.net/forum?id=oxnp2q-PGL4", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 5 + ], + "rating": "5.67", + "confidences": [ + 1, + 3, + 2 + ], + "abstract": "Lifting is an efficient technique to scale up graphical models generalized to relational domains by exploiting the underlying symmetries. 
Concurrently, neural models are continuously expanding from grid-like tensor data into structured representations, such as various attributed graphs and relational databases. To address the irregular structure of the data, the models typically extrapolate on the idea of convolution, effectively introducing parameter sharing in their, dynamically unfolded, computation graphs. The computation graphs themselves then reflect the symmetries of the underlying data, similarly to the lifted graphical models. Inspired by lifting, we introduce a simple and efficient technique to detect the symmetries and compress the neural models without loss of any information. We demonstrate through experiments that such compression can lead to significant speedups of structured convolutional models, such as various Graph Neural Networks, across various tasks, such as molecule classification and knowledge-base completion.", + "title": "Lossless Compression of Structured Convolutional Models via Lifting", + "authors": [ + "Gustav Sourek", + "Filip Zelezny", + "Ondrej Kuzelka" + ], + "emails": [ + "~Ondrej_Kuzelka1", + "~Gustav_Sourek1", + "fel.cvut.cz" + ], + "rank": 1179 + }, + { + "url": "https://openreview.net/forum?id=GNv-TyWu3PY", + "decision": "Reject", + "ratings": [ + 5, + 3, + 7, + 8 + ], + "rating": "5.67", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "We consider the problem of routing users through a network with unknown congestion functions over an infinite time horizon. On each time step t, the algorithm receives a routing request and must select a valid path. For each edge e in the selected path, the algorithm incurs a cost cet=fe(xet)+\u03b7et, where xet is the flow on edge e at time t, fe is the congestion function, and \u03b7et is a noise sample drawn from an unknown distribution. The algorithm observes cet, and can use this observation in future routing decisions. The routing requests are supplied adversarially. 
\n\nWe present an algorithm with cumulative regret O~(|E|t2/3), where the regret on each time step is defined as the difference between the total cost incurred by our chosen path and the minimum cost among all valid paths. Our algorithm has space complexity O(|E|t1/3) and time complexity O(|E|log\u2061t). We also validate our algorithm empirically using graphs from New York City road networks.", + "title": "Robust Learning for Congestion-Aware Routing", + "authors": [ + "Sreenivas Gollapudi", + "Kostas Kollias", + "Benjamin Plaut", + "Ameya Velingker" + ], + "emails": [ + "~Sreenivas_Gollapudi2", + "google.com", + "~Ameya_Velingker1", + "~Benjamin_Plaut2" + ], + "rank": 1180 + }, + { + "url": "https://openreview.net/forum?id=uELnyih9gqb", + "decision": "Reject", + "ratings": [ + 7, + 5, + 7, + 4 + ], + "rating": "5.65", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "Deep quantization of neural networks below eight bits can lead to superlinear benefits in storage and compute efficiency. However, homogeneously quantizing all the layers to the same level does not account for the distinction of the layers and their individual properties. Heterogenous assignment of bitwidths to individual layers is attractive but opens an exponentially large non-contiguous hyperparameter space (AvailableBitwidths#Layers). As such finding the bitwidth while also quantizing the network to those levels becomes a major challenge. This paper addresses this challenge through a sinusoidal regularization mechanism, dubbed WaveQ. Adding our parametrized sinusoidal regularizer enables us to not only find the quantized weights but also learn the bitwidth of the layers by making the period of the sinusoidal regularizer a trainable parameter. In addition, the sinusoidal regularizer itself is designed to align its minima on the quantization levels. 
With these two innovations, during training, stochastic gradient descent uses the form of the sinusoidal regularizer and its minima to push the weights to the quantization levels while it is also learning the period which will determine the bitwidth of each layer separately. As such WaveQ is a gradient-based mechanism that jointly learns the quantized weights as well as the heterogeneous bitwidths. We show how WaveQ balance compute efficiency and accuracy, and provide a heterogeneous bitwidth assignment for quantization of a large variety of deep networks (AlexNet, CIFAR-10, MobileNet, ResNet-18, ResNet-20, SVHN, and VGG-11) that virtually preserves the accuracy. WaveQ is versatile and can also be used with predetermined bitwidths by fixing the period of the sinusoidal regularizer. In this case. WaveQ enhances quantized training algorithms (DoReFa and WRPN) with about 4.8% accuracy improvements on average, and outperforms multiple state-of-the-art techniques. Finally, WaveQ applied to quantizing transformers\n", + "title": "WAVEQ: GRADIENT-BASED DEEP QUANTIZATION OF NEURAL NETWORKS THROUGH SINUSOIDAL REGULARIZATION", + "authors": [ + "Ahmed T. Elthakeb", + "Prannoy Pilligundla", + "Tarek Elgindi", + "Fatemehsadat Mireshghallah", + "Charles-Alban Deledalle", + "Hadi Esmaeilzadeh" + ], + "emails": [ + "~Hadi_Esmaeilzadeh1", + "~Fatemehsadat_Mireshghallah1", + "ucsd.edu", + "~Charles-Alban_Deledalle2", + "~Prannoy_Pilligundla1", + "~Ahmed_T._Elthakeb1" + ], + "rank": 1181 + }, + { + "url": "https://openreview.net/forum?id=3ZeGLibhFo0", + "decision": "Reject", + "ratings": [ + 5, + 7, + 4, + 7 + ], + "rating": "5.65", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Balanced representation learning methods have been applied successfully to counterfactual\ninference from observational data. However, approaches that account for\nsurvival outcomes are relatively limited. 
Survival data are frequently encountered\nacross diverse medical applications, i.e., drug development, risk profiling, and clinical\ntrials, and such data are also relevant in fields like manufacturing (for equipment\nmonitoring). When the outcome of interest is time-to-event, special precautions\nfor handling censored events need to be taken, as ignoring censored outcomes may\nlead to biased estimates. We propose a theoretically grounded unified framework\nfor counterfactual inference applicable to survival outcomes. Further, we formulate\na nonparametric hazard ratio metric for evaluating average and individualized\ntreatment effects. Experimental results on real-world and semi-synthetic datasets,\nthe latter which we introduce, demonstrate that the proposed approach significantly\noutperforms competitive alternatives in both survival-outcome predictions and\ntreatment-effect estimation.", + "title": "Enabling counterfactual survival analysis with balanced representations", + "authors": [ + "Paidamoyo Chapfuwa", + "Serge Assaad", + "Shuxi Zeng", + "Michael Pencina", + "Lawrence Carin", + "Ricardo Henao" + ], + "emails": [ + "duke.edu", + "~Ricardo_Henao1", + "gmail.com", + "~Paidamoyo_Chapfuwa1", + "~Serge_Assaad1", + "~Lawrence_Carin2" + ], + "rank": 1182 + }, + { + "url": "https://openreview.net/forum?id=Zc36Mbb8G6", + "decision": "Reject", + "ratings": [ + 4, + 6, + 7, + 6 + ], + "rating": "5.65", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Recent advances in generative adversarial networks (GANs) have shown remarkable progress in generating high-quality images. However, this gain in performance depends on the availability of a large amount of training data. In limited data regimes, training typically diverges, and therefore the generated samples are of low quality and lack diversity. Previous works have addressed training in low data setting by leveraging transfer learning and data augmentation techniques. 
We propose a novel transfer learning method for GANs in the limited data domain by leveraging informative data prior derived from self-supervised/supervised pre-trained networks trained on a diverse source domain. We perform experiments on several standard vision datasets using various GAN architectures (BigGAN, SNGAN, StyleGAN2) to demonstrate that the proposed method effectively transfers knowledge to domains with few target images, outperforming existing state-of-the-art techniques in terms of image quality and diversity. We also show the utility of data instance prior in large-scale unconditional image generation and image editing tasks.", + "title": "Data Instance Prior for Transfer Learning in GANs", + "authors": [ + "Puneet Mangla", + "Nupur Kumari", + "Mayank Singh", + "Vineeth N. Balasubramanian", + "Balaji Krishnamurthy" + ], + "emails": [ + "~Puneet_Mangla1", + "~Nupur_Kumari1", + "~Mayank_Singh2", + "~Balaji_Krishnamurthy1", + "~Vineeth_N._Balasubramanian2" + ], + "rank": 1183 + }, + { + "url": "https://openreview.net/forum?id=bjkX6Kzb5H", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 5, + 7, + 6, + 4 + ], + "rating": "5.65", + "confidences": [ + 2, + 4, + 4, + 4, + 3 + ], + "abstract": "Constructing large, labeled training datasets for segmentation models is an expensive and labor-intensive process. This is a common challenge in machine learning, addressed by methods that require few or no labeled data points such as few-shot learning (FSL) and weakly-supervised learning (WS). Such techniques, however, have limitations when applied to image segmentation---FSL methods often produce noisy results and are strongly dependent on which few datapoints are labeled, while WS models struggle to fully exploit rich image information. We propose a framework that fuses FSL and WS for segmentation tasks, enabling users to train high-performing segmentation networks with very few hand-labeled training points. 
We use FSL models as weak sources in a WS framework, requiring a very small set of reference labeled images, and introduce a new WS model that focuses on key areas---areas with contention among noisy labels---of the image to fuse these weak sources. Empirically, we evaluate our proposed approach over seven well-motivated segmentation tasks. We show that our methods can achieve within 1.4 Dice points compared to fully supervised networks while only requiring five hand-labeled training points. Compared to existing FSL methods, our approach improves performance by a mean 3.6 Dice points over the next-best method. ", + "title": "Cut out the annotator, keep the cutout: better segmentation with weak supervision", + "authors": [ + "Sarah Hooper", + "Michael Wornow", + "Ying Hang Seah", + "Peter Kellman", + "Hui Xue", + "Frederic Sala", + "Curtis Langlotz", + "Christopher Re" + ], + "emails": [ + "~Curtis_Langlotz1", + "stanford.edu", + "nhlbi.nih.gov", + "~Sarah_Hooper1", + "~Christopher_Re1", + "~Frederic_Sala1", + "nih.gov" + ], + "rank": 1184 + }, + { + "url": "https://openreview.net/forum?id=WEHSlH5mOk", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 7, + 6 + ], + "rating": "5.64", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "Time series forecasting is an extensively studied subject in statistics, economics, and computer science. Exploration of the correlation and causation among the variables in a multivariate time series shows promise in enhancing the performance of a time series model. When using deep neural networks as forecasting models, we hypothesize that exploiting the pairwise information among multiple (multivariate) time series also improves their forecast. If an explicit graph structure is known, graph neural networks (GNNs) have been demonstrated as powerful tools to exploit the structure. In this work, we propose learning the structure simultaneously with the GNN if the graph is unknown. 
We cast the problem as learning a probabilistic graph model through optimizing the mean performance over the graph distribution. The distribution is parameterized by a neural network so that discrete graphs can be sampled differentiably through reparameterization. Empirical evaluations show that our method is simpler, more efficient, and better performing than a recently proposed bilevel learning approach for graph structure learning, as well as a broad array of forecasting models, either deep or non-deep learning based, and graph or non-graph based.", + "title": "Discrete Graph Structure Learning for Forecasting Multiple Time Series", + "authors": [ + "Chao Shang", + "Jie Chen", + "Jinbo Bi" + ], + "emails": [ + "~Chao_Shang1", + "~Jinbo_Bi1", + "~Jie_Chen1" + ], + "rank": 1185 + }, + { + "url": "https://openreview.net/forum?id=zUMD--Fb9Bt", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 7 + ], + "rating": "5.64", + "confidences": [ + 3, + 5, + 3, + 3 + ], + "abstract": "Graph Convolutional Networks (GCNs) have attracted a lot of research interest in the machine learning community in recent years. Although many variants have been proposed, we still lack a systematic view of different GCN models and deep understanding of the relations among them. In this paper, we take a step forward to establish a unified framework for convolution-based graph neural networks, by formulating the basic graph convolution operation as an optimization problem in the graph Fourier space. Under this framework, a variety of popular GCN models, including the vanilla-GCNs, attention-based GCNs and topology-based GCNs, can be interpreted as a same optimization problem but with different carefully designed regularizers. This novel perspective enables a better understanding of the similarities and differences among many widely used GCNs, and may inspire new approaches for designing better models. 
As a showcase, we also present a novel regularization technique under the proposed framework to tackle the oversmoothing problem in graph convolution. The effectiveness of the newly designed model is validated empirically.", + "title": "A Unified Framework for Convolution-based Graph Neural Networks", + "authors": [ + "Xuran Pan", + "Shiji Song", + "Gao Huang" + ], + "emails": [ + "~Gao_Huang1", + "~Xuran_Pan1", + "~Shiji_Song1" + ], + "rank": 1186 + }, + { + "url": "https://openreview.net/forum?id=ZVBtN6B_6i7", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5 + ], + "rating": "5.64", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "Attention mechanisms have shown promising results in sequence modeling tasks that require long-term memory. Recent work has investigated mechanisms to reduce the computational cost of preserving and storing the memories. However, not all content in the past is equally important to remember. We propose Expire-Span, a method that learns to retain the most important information and \nexpire the irrelevant information. This enables Transformers to scale to attend to tens of thousands of previous timesteps efficiently, as not all hidden states from previous timesteps are preserved. We demonstrate that Expire-Span can help models identify and retain critical information and show it can achieve state of the art results on long-context language modeling, reinforcement learning, and algorithmic tasks. 
Finally, we show that Expire-Span can scale to memories that are tens of thousands in size, which is helpful on incredibly long context tasks such as character-level PG-19 and a frame-by-frame moving objects task.\n", + "title": "Not All Memories are Created Equal: Learning to Expire", + "authors": [ + "Sainbayar Sukhbaatar", + "Da JU", + "Spencer Poff", + "Stephen Roller", + "Arthur Szlam", + "Jason E Weston", + "Angela Fan" + ], + "emails": [ + "~Angela_Fan2", + "~Sainbayar_Sukhbaatar1", + "~Arthur_Szlam1", + "~Da_JU1", + "fb.com", + "~Jason_E_Weston1", + "~Stephen_Roller1" + ], + "rank": 1187 + }, + { + "url": "https://openreview.net/forum?id=cy0jU8F60Hy", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6 + ], + "rating": "5.64", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "We propose conditional transport (CT) as a new divergence to measure the difference between two probability distributions. The CT divergence consists of the expected cost of a forward CT, which constructs a navigator to stochastically transport a data point of one distribution to the other distribution, and that of a backward CT which reverses the transport direction. To apply it to the distributions whose probability density functions are unknown but random samples are accessible, we further introduce asymptotic CT (ACT), whose estimation only requires access to mini-batch based discrete empirical distributions. Equipped with two navigators that amortize the computation of conditional transport plans, the ACT divergence comes with unbiased sample gradients that are straightforward to compute, making it amenable to mini-batch stochastic gradient descent based optimization. When applied to train a generative model, the ACT divergence is shown to strike a good balance between mode covering and seeking behaviors and strongly resist mode collapse. 
To model high-dimensional data, we show that it is sufficient to modify the adversarial game of an existing generative adversarial network (GAN) to a game played by a generator, a forward navigator, and a backward navigator, which try to minimize a distribution-to-distribution transport cost by optimizing both the distribution of the generator and conditional transport plans specified by the navigators, versus a critic that does the opposite by inflating the point-to-point transport cost. On a wide variety of benchmark datasets for generative modeling, substituting the default statistical distance of an existing GAN with the ACT divergence is shown to consistently improve the performance. ", + "title": "ACT: Asymptotic Conditional Transport", + "authors": [ + "Huangjie Zheng", + "Mingyuan Zhou" + ], + "emails": [ + "~Mingyuan_Zhou1", + "~Huangjie_Zheng1" + ], + "rank": 1188 + }, + { + "url": "https://openreview.net/forum?id=ZglaBL5inu", + "decision": "Reject", + "ratings": [ + 5, + 5, + 8, + 4 + ], + "rating": "5.64", + "confidences": [ + 5, + 2, + 4, + 3 + ], + "abstract": "We use hyperbolic Poisson kernel to construct the horocycle neuron model on hyperbolic spaces, which is a spectral generalization of the classical neuron model. We prove a universal approximation theorem for horocycle neurons. As a corollary, this theorem leads to a state-of-the-art result on the expressivity of neurons of the hyperbolic MLR. 
Our experiments get state-of-the-art results on the Poincare-embedding tree classification task and the two-dimensional visualization of images.", + "title": "Laplacian Eigenspaces, Horocycles and Neuron Models on Hyperbolic Spaces", + "authors": [ + "Ming-Xi Wang" + ], + "emails": [ + "~Ming-Xi_Wang1" + ], + "rank": 1189 + }, + { + "url": "https://openreview.net/forum?id=oGq4d9TbyIA", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5 + ], + "rating": "5.64", + "confidences": [ + 4, + 5, + 5 + ], + "abstract": "Uniform-precision neural network quantization has gained popularity thanks to its simple arithmetic unit densely packed for high computing capability. However, it ignores heterogeneous sensitivity to the impact of quantization across the layers, resulting in sub-optimal inference accuracy. This work proposes a novel approach to adjust the network structure to alleviate the impact of uniform-precision quantization. The proposed neural architecture search selectively expands channels for the quantization sensitive layers while satisfying hardware constraints (e.g., FLOPs). We provide substantial insights and empirical evidence that the proposed search method called neural channel expansion can adapt several popular networks' channels to achieve superior 2-bit quantization accuracy on CIFAR10 and ImageNet. 
In particular, we demonstrate the best-to-date Top-1/Top-5 accuracy for 2-bit ResNet50 with smaller FLOPs and the parameter size.", + "title": "Uniform-Precision Neural Network Quantization via Neural Channel Expansion", + "authors": [ + "Seongmin Park", + "Beomseok Kwon", + "Kyuyoung Sim", + "Jieun Lim", + "Tae-Ho Kim", + "Jungwook Choi" + ], + "emails": [ + "hanyang.ac.kr", + "~Tae-Ho_Kim2", + "~Jungwook_Choi1", + "nota.ai" + ], + "rank": 1190 + }, + { + "url": "https://openreview.net/forum?id=Utc4Yd1RD_s", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6 + ], + "rating": "5.64", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "There is now extensive evidence demonstrating that deep neural networks are vulnerable to adversarial examples, motivating the development of defenses against adversarial attacks. However, existing adversarial defenses typically improve model robustness against individual specific perturbation types. Some recent methods improve model robustness against adversarial attacks in multiple \u2113p balls, but their performance against each perturbation type is still far from satisfactory. To better understand this phenomenon, we propose the \\emph{multi-domain} hypothesis, stating that different types of adversarial perturbations are drawn from different domains. Guided by the multi-domain hypothesis, we propose~\\emph{Gated Batch Normalization (GBN)}, a novel building block for deep neural networks that improves robustness against multiple perturbation types. GBN consists of a gated sub-network and a multi-branch batch normalization (BN) layer, where the gated sub-network separates different perturbation types, and each BN branch is in charge of a single perturbation type and learns domain-specific statistics for input transformation. Then, features from different branches are aligned as domain-invariant representations for the subsequent layers. 
We perform extensive evaluations of our approach on MNIST, CIFAR-10, and Tiny-ImageNet, and in doing so demonstrate that GBN outperforms previous defense proposals against multiple perturbation types, \\ie, \u21131, \u21132, and \u2113\u221e perturbations, by large margins of 10-20\\%.", + "title": "Towards Defending Multiple Adversarial Perturbations via Gated Batch Normalization", + "authors": [ + "Aishan Liu", + "Shiyu Tang", + "Xianglong Liu", + "Xinyun Chen", + "Lei Huang", + "Zhuozhuo Tu", + "Dawn Song", + "Dacheng Tao" + ], + "emails": [ + "~Aishan_Liu1", + "~Dawn_Song1", + "~Xianglong_Liu2", + "~Dacheng_Tao1", + "~Shiyu_Tang1", + "~Zhuozhuo_Tu1", + "~Xinyun_Chen1", + "~Lei_Huang1" + ], + "rank": 1191 + }, + { + "url": "https://openreview.net/forum?id=IZQm8mMRVqW", + "decision": "Reject", + "ratings": [ + 6, + 4, + 7, + 6 + ], + "rating": "5.64", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "The Heavy Ball Method, proposed by Polyak over five decades ago, is a first-order method for optimizing continuous functions. While its stochastic counterpart has proven extremely popular in training deep networks, there are almost no known functions where deterministic Heavy Ball is provably faster than the simple and classical gradient descent algorithm in non-convex optimization. The success of Heavy Ball has thus far eluded theoretical understanding. Our goal is to address this gap, and in the present work we identify two non-convex problems where we provably show that the Heavy Ball momentum helps the iterate to enter a benign region that contains a global optimal point faster. We show that Heavy Ball exhibits simple dynamics that clearly reveal the benefit of using a larger value of momentum parameter for the problems. The first of these optimization problems is the phase retrieval problem, which has useful applications in physical science. 
The second of these optimization problems is the cubic-regularized minimization, a critical subroutine required by Nesterov-Polyak cubic-regularized method to find second-order stationary points in general smooth non-convex problems.", + "title": "Quickly Finding a Benign Region via Heavy Ball Momentum in Non-Convex Optimization", + "authors": [ + "Jun-Kun Wang", + "Jacob Abernethy" + ], + "emails": [ + "~Jun-Kun_Wang1", + "~Jacob_Abernethy1" + ], + "rank": 1192 + }, + { + "url": "https://openreview.net/forum?id=VbCVU10R7K", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5 + ], + "rating": "5.64", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "The presence of uncertainty in policy evaluation significantly complicates the process of policy ranking and selection in real-world settings. We formally consider offline policy selection as learning preferences over a set of policy prospects given a fixed experience dataset. While one can select or rank policies based on point estimates of their policy values or high-confidence intervals, access to the full distribution over one's belief of the policy value enables more flexible selection algorithms under a wider range of downstream evaluation metrics. We propose BayesDICE for estimating this belief distribution in terms of posteriors of distribution correction ratios derived from stochastic constraints (as opposed to explicit likelihood, which is not available). Empirically, BayesDICE is highly competitive to existing state-of-the-art approaches in confidence interval estimation. 
More importantly, we show how the belief distribution estimated by BayesDICE may be used to rank policies with respect to any arbitrary downstream policy selection metric, and we empirically demonstrate that this selection procedure significantly outperforms existing approaches, such as ranking policies according to mean or high-confidence lower bound value estimates.", + "title": "Offline policy selection under Uncertainty", + "authors": [ + "Mengjiao Yang", + "Bo Dai", + "Ofir Nachum", + "George Tucker", + "Dale Schuurmans" + ], + "emails": [ + "~George_Tucker1", + "~Ofir_Nachum1", + "~Bo_Dai1", + "~Dale_Schuurmans1", + "~Mengjiao_Yang1" + ], + "rank": 1193 + }, + { + "url": "https://openreview.net/forum?id=b7g3_ZMHnT0", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 4, + 6 + ], + "rating": "5.64", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "Knowledge graphs (KGs) have helped neural models improve performance on various knowledge-intensive tasks, like question answering and item recommendation. By using attention over the KG, such KG-augmented models can also \"explain\" which KG information was most relevant for making a given prediction. In this paper, we question whether these models are really behaving as we expect. We show that, through a reinforcement learning policy (or even simple heuristics), one can produce deceptively perturbed KGs, which maintain the downstream performance of the original KG while significantly deviating from the original KG's semantics and structure. 
Our findings raise doubts about KG-augmented models' ability to reason about KG information and give sensible explanations.", + "title": "Learning to Deceive Knowledge Graph Augmented Models via Targeted Perturbation", + "authors": [ + "Mrigank Raman", + "Aaron Chan", + "Siddhant Agarwal", + "PeiFeng Wang", + "Hansen Wang", + "Sungchul Kim", + "Ryan Rossi", + "Handong Zhao", + "Nedim Lipka", + "Xiang Ren" + ], + "emails": [ + "~Aaron_Chan1", + "~Siddhant_Agarwal1", + "gmail.com", + "~PeiFeng_Wang1", + "~Handong_Zhao3", + "adobe.com", + "~Hansen_Wang1", + "~Xiang_Ren1", + "~Ryan_Rossi1" + ], + "rank": 1194 + }, + { + "url": "https://openreview.net/forum?id=CLnj31GZ4cI", + "decision": "Reject", + "ratings": [ + 6, + 4, + 7, + 6 + ], + "rating": "5.64", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "We study the problem of injecting knowledge into large pre-trained models like BERT and RoBERTa. Existing methods typically update the original parameters of pre-trained models when injecting knowledge. However, when multiple kinds of knowledge are injected, they may suffer from catastrophic forgetting. To address this, we propose K-Adapter, which remains the original parameters of the pre-trained model fixed and supports continual knowledge infusion. Taking RoBERTa as the pre-trained model, K-Adapter has a neural adapter for each kind of infused knowledge, like a plug-in connected to RoBERTa. There is no information flow between different adapters, thus different adapters are efficiently trained in a distributed way. We inject two kinds of knowledge, including factual knowledge obtained from automatically aligned text-triplets on Wikipedia and Wikidata, and linguistic knowledge obtained from dependency parsing. Results on three knowledge-driven tasks (total six datasets) including relation classification, entity typing and question answering demonstrate that each adapter improves the performance, and the combination of both adapters brings further improvements. 
Probing experiments further indicate that K-Adapter captures richer factual and commonsense knowledge than RoBERTa.", + "title": "K-Adapter: Infusing Knowledge into Pre-Trained Models with Adapters", + "authors": [ + "Ruize Wang", + "Duyu Tang", + "Nan Duan", + "zhongyu wei", + "Xuanjing Huang", + "Jianshu Ji", + "Guihong Cao", + "Daxin Jiang", + "Ming Zhou" + ], + "emails": [ + "~zhongyu_wei1", + "~Duyu_Tang1", + "~Xuanjing_Huang1", + "~Ruize_Wang1", + "microsoft.com", + "~Ming_Zhou1", + "~Nan_Duan1" + ], + "rank": 1195 + }, + { + "url": "https://openreview.net/forum?id=KG4igOosnw8", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6 + ], + "rating": "5.64", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "The use of episodic memories in continual learning has been shown to be effective in terms of alleviating catastrophic forgetting. In recent studies, several gradient-based approaches have been developed to make more efficient use of compact episodic memories, which constrain the gradients resulting from new samples with those from memorized samples, aiming to reduce the diversity of gradients from different tasks. In this paper, we reveal the relation between diversity of gradients and discriminativeness of representations, demonstrating connections between Deep Metric Learning and continual learning. Based on these findings, we propose a simple yet efficient method -- Discriminative Representation Loss (DRL) -- for continual learning. 
In comparison with several state-of-the-art methods, this method shows effectiveness with low computational cost on multiple benchmark experiments in the setting of online continual learning.", + "title": "Discriminative Representation Loss (DRL): A More Efficient Approach than Gradient Re-Projection in Continual Learning", + "authors": [ + "Yu Chen", + "Tom Diethe", + "Peter Flach" + ], + "emails": [ + "~Yu_Chen10", + "~Tom_Diethe1", + "~Peter_Flach1" + ], + "rank": 1196 + }, + { + "url": "https://openreview.net/forum?id=BUPIRa1D2J", + "decision": "Reject", + "ratings": [ + 6, + 6, + 7, + 4 + ], + "rating": "5.64", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Capsule Networks (CapsNets) have shown to be a promising alternative to Convolutional Neural Networks (CNNs) in many computer vision tasks, due to their ability to encode object viewpoint variations. The high computational complexity and numerical instability of iterative routing mechanisms stem from the challenging nature of the part-object encoding process. This hinders CapsNets from being utilized effectively in large-scale image tasks. In this paper, we propose a novel non-iterative routing strategy named self-attention routing (SAR) that computes the agreement between the capsules in one forward pass. SAR accomplishes this by utilizing a learnable inducing mixture of Gaussians (IMoG) to reduce the cost of computing pairwise attention values from quadratic to linear time complexity. Our observations show that our Transformer Capsule Network (Trans-Caps) is better suited for complex image tasks including CIFAR-10/100, Tiny-ImageNet, and ImageNet when compared to other prominent CapsNet architectures. We also show that Trans-Caps yields a dramatic improvement over its competitors when presented with novel viewpoints on the SmallNORB dataset, outperforming EM-Caps by 5.77% and 3.25% on the novel azimuth and elevation experiments, respectively. 
Our observations suggest that our routing mechanism is able to capture complex part-whole relationships which allow Trans-Caps to construct reliable geometrical representations of the objects.", + "title": "Trans-Caps: Transformer Capsule Networks with Self-attention Routing", + "authors": [ + "Aryan Mobiny", + "Pietro Antonio Cicalese", + "Hien Van Nguyen" + ], + "emails": [ + "~Hien_Van_Nguyen1", + "~Pietro_Antonio_Cicalese1", + "~Aryan_Mobiny1" + ], + "rank": 1197 + }, + { + "url": "https://openreview.net/forum?id=fV4vvs1J5iM", + "decision": "Reject", + "ratings": [ + 5, + 5, + 7, + 6 + ], + "rating": "5.64", + "confidences": [ + 4, + 2, + 2, + 3 + ], + "abstract": "Many applications of reinforcement learning (RL) optimize a long-term reward subject to risk, safety, budget, diversity or other constraints. Though constrained RL problem has been studied to incorporate various constraints, existing methods either tie to specific families of RL algorithms or require storing infinitely many individual policies found by an RL oracle to approach a feasible solution. In this paper, we present a novel reduction approach for constrained RL problem that ensures convergence when using any off-the-shelf RL algorithm to construct an RL oracle yet requires storing at most constantly many policies. The key idea is to reduce the constrained RL problem to a distance minimization problem, and a novel variant of Frank-Wolfe algorithm is proposed for this task. Throughout the learning process, our method maintains at most constantly many individual policies, where the constant is shown to be worst-case optimal to ensure convergence of any RL oracle. Our method comes with rigorous convergence and complexity analysis, and does not introduce any extra hyper-parameter. 
Experiments on a grid-world navigation task demonstrate the efficiency of our method.\n", + "title": "A Reduction Approach to Constrained Reinforcement Learning", + "authors": [ + "Tianchi Cai", + "Wenjie Shi", + "Lihong Gu", + "Xiaodong Zeng", + "Jinjie Gu" + ], + "emails": [ + "~Wenjie_Shi1", + "antgroup.com" + ], + "rank": 1198 + }, + { + "url": "https://openreview.net/forum?id=Fblk4_Fd7ao", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 6 + ], + "rating": "5.64", + "confidences": [ + 3, + 1, + 4, + 3 + ], + "abstract": "Effective communication is an important skill for enabling information exchange and cooperation in multi-agent settings. Indeed, emergent communication is now a vibrant field of research, with common settings involving discrete cheap-talk channels. One limitation of this setting is that it does not allow for the emergent protocols to generalize beyond the training partners. \n Furthermore, so far emergent communication has primarily focused on the use of symbolic channels. In this work, we extend this line of work to a new modality, by studying agents that learn to communicate via actuating their joints in a 3D environment. We show that under realistic assumptions, a non-uniform distribution of intents and a common-knowledge energy cost, these agents can find protocols that generalize to novel partners. We also explore and analyze specific difficulties associated with finding these solutions in practice. 
Finally, we propose and evaluate initial training improvements to address these challenges, involving both specific training curricula and providing the latent feature that can be coordinated on during training.", + "title": "Exploring Zero-Shot Emergent Communication in Embodied Multi-Agent Populations", + "authors": [ + "Kalesha Bullard", + "Franziska Meier", + "Douwe Kiela", + "Joelle Pineau", + "Jakob Nicolaus Foerster" + ], + "emails": [ + "~Jakob_Nicolaus_Foerster1", + "~Franziska_Meier2", + "~Joelle_Pineau1", + "~Kalesha_Bullard1", + "~Douwe_Kiela1" + ], + "rank": 1199 + }, + { + "url": "https://openreview.net/forum?id=D_I6trPKwlt", + "ratings": [ + 7, + 4, + 7, + 5 + ], + "rating": "5.62", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "We consider the problem of learning compositional hierarchies of graphs. Even though structural characteristics of graphs can be learned by Graph Neural Networks (GNNs), it is difficult to find an overall compositional hierarchy using such flat operators.\nIn this paper, we propose a new graph pooling algorithm, Spectrally Similar Graph Pooling (SSGPool), to learn hierarchical representations of graphs. The main idea of the proposed SSGPool algorithm is to learn a coarsening matrix which maps nodes from an original graph to a smaller number of nodes in a coarsened graph. The coarsening matrix is trained to coarsen the nodes based on their feature vectors while keeping the spectral characteristics of the original graph in the coarsened one. Although existing graph pooling methods take either feature-based pooling or structure-preserving pooling, SSGPool considers two properties simultaneously in an end-to-end manner. Experiments on various graph benchmarks show the advantage of our method compared to strong baselines. To further investigate the effectiveness of our proposed method, we evaluate our approach on a real-world problem, image retrieval with visual scene graphs. 
Quantitative and qualitative analyses on the retrieval problem confirm that the proposed method efficiently captures the hierarchical semantic structure of scene graphs.", + "title": "Spectrally Similar Graph Pooling", + "authors": [ + "Kyoung-Woon On", + "Eun-Sol Kim", + "Il-Jae Kwon", + "Sangwoong Yoon", + "Byoung-Tak Zhang" + ], + "emails": [ + "~Byoung-Tak_Zhang1", + "~Il-Jae_Kwon1", + "~Eun-Sol_Kim1", + "~Sangwoong_Yoon1", + "~Kyoung-Woon_On1" + ], + "rank": 1200 + }, + { + "url": "https://openreview.net/forum?id=RHY_9ZVcTa_", + "decision": "Reject", + "ratings": [ + 6, + 4, + 7, + 6 + ], + "rating": "5.62", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "Identifiability is a desirable property of a statistical model: it implies that the true model parameters may be estimated to any desired precision, given sufficient computational resources and data. We study identifiability in the context of representation learning: discovering nonlinear data representations that are optimal with respect to some downstream task. When parameterized as deep neural networks, such representation functions lack identifiability in parameter space, because they are overparameterized by design. In this paper, building on recent advances in nonlinear Independent Components Analysis, we aim to rehabilitate identifiability by showing that a large family of discriminative models are in fact identifiable in function space, up to a linear indeterminacy. Many models for representation learning in a wide variety of domains have been identifiable in this sense, including text, images and audio, state-of-the-art at time of publication. 
We derive sufficient conditions for linear identifiability and provide empirical support for the result on both simulated and real-world data.", + "title": "On Linear Identifiability of Learned Representations", + "authors": [ + "Geoffrey Roeder", + "Luke Metz", + "Diederik P Kingma" + ], + "emails": [ + "~Geoffrey_Roeder1", + "~Diederik_P_Kingma1", + "~Luke_Metz1" + ], + "rank": 1201 + }, + { + "url": "https://openreview.net/forum?id=UiLl8yjh57", + "decision": "Reject", + "ratings": [ + 5, + 7, + 7, + 3 + ], + "rating": "5.62", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "In this paper, we investigate the problem of scheduling and resource allocation over a time varying set of clients with heterogeneous demands. This problem appears when service providers need to serve traffic generated by users with different classes of requirements. We thus have to allocate bandwidth resources over time to efficiently satisfy these demands within a limited time horizon. This is a highly intricate problem and solutions may involve tools stemming from diverse fields like combinatorics and optimization. Recent work has successfully proposed Deep Reinforcement Learning (DRL) solutions, although not yet for heterogeneous user traffic. We propose a deep deterministic policy gradient algorithm combining state of the art techniques, namely Distributional RL and Deep Sets, to train a model for heterogeneous traffic scheduling. We test on diverse number scenarios with different time dependence dynamics, users\u2019 requirements, and resources available, demonstrating consistent results. We evaluate the algorithm on a wireless communication setting and show significant gains against state-of-the-art conventional algorithms from combinatorics and optimization (e.g. 
Knapsack, Integer Linear Programming, Frank-Wolfe).", + "title": "Deep Reinforcement Learning For Wireless Scheduling with Multiclass Services", + "authors": [ + "Apostolos Avranas", + "Marios Kountouris", + "Philippe Ciblat" + ], + "emails": [ + "eurecom.fr", + "~Philippe_Ciblat1", + "~Apostolos_Avranas1" + ], + "rank": 1202 + }, + { + "url": "https://openreview.net/forum?id=1yXhko8GZEE", + "decision": "Reject", + "ratings": [ + 6, + 5, + 4, + 7 + ], + "rating": "5.62", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "One of the major challenges when training generative adversarial nets (GANs) is instability. To address this instability spectral normalization (SN) is remarkably successful. However, SN-GAN still suffers from training instabilities, especially when working with higher-dimensional data. We find that those instabilities are accompanied by large condition numbers of the discriminator weight matrices. To improve training stability we study common linear-algebra practice and employ preconditioning. Specifically, we introduce a preconditioning layer (PC-layer)that performs a low-degree polynomial preconditioning. We use this PC-layer in two ways: 1) fixed preconditioning (FPC) adds a fixed PC-layer to all layers, and 2) adaptive preconditioning (APC) adaptively controls the strength of preconditioning. Empirically, we show that FPC and APC stabilize the training of un-conditional GANs using classical architectures. 
On LSUN256\u00d7256 data, APC improves FID scores by around 5 points over baselines.", + "title": "Precondition Layer and Its Use for GANs", + "authors": [ + "Tiantian Fang", + "Alex Schwing", + "Ruoyu Sun" + ], + "emails": [ + "~Ruoyu_Sun1", + "~Alex_Schwing1", + "~Tiantian_Fang1" + ], + "rank": 1203 + }, + { + "url": "https://openreview.net/forum?id=x6x7FWFNZpg", + "decision": "Reject", + "ratings": [ + 7, + 5, + 6, + 5 + ], + "rating": "5.62", + "confidences": [ + 3, + 4, + 4, + 5 + ], + "abstract": "The ability to scale distributed optimization to large node counts has been one of the main enablers of recent progress in machine learning. To this end, several techniques have been explored, such as asynchronous, quantized and decentralized communication--which significantly reduce the impact of communication and synchronization, as well as the ability for nodes to perform several local model updates before communicating--which reduces the frequency of communication. \nIn this paper, we show that these techniques, which have so far largely been considered independently, can be jointly leveraged to minimize distribution cost for training neural network models via stochastic gradient descent (SGD). \nWe consider a setting with minimal coordination: we have a large number of nodes on a communication graph, each with a local subset of data, performing independent SGD updates onto their local models. After some number of local updates, each node chooses an interaction partner uniformly at random from its neighbors, and averages a (possibly quantized) version of its local model with the neighbor's model. \nOur first contribution is in proving that, even under such a relaxed setting, SGD can still be guaranteed to converge under standard assumptions. 
The proof is based on a new connection with parallel load-balancing processes, and improves existing techniques by handling decentralization, asynchrony, quantization, and local updates, into a single framework, and bounding their impact. \nOn the practical side, we implement variants of our algorithm and deploy them onto distributed environments, and show that they can successfully converge and scale for large-scale neural network training tasks, matching or even slightly improving the accuracy of previous methods. ", + "title": "Decentralized SGD with Asynchronous, Local and Quantized Updates", + "authors": [ + "Giorgi Nadiradze", + "Amirmojtaba Sabour", + "Peter Davies", + "Ilia Markov", + "Shigang Li", + "Dan Alistarh" + ], + "emails": [ + "~Giorgi_Nadiradze1", + "~Dan_Alistarh7", + "gmail.com", + "ist.ac.at" + ], + "rank": 1204 + }, + { + "url": "https://openreview.net/forum?id=uys9OcmXNtU", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5 + ], + "rating": "5.62", + "confidences": [ + 2, + 3, + 3 + ], + "abstract": "Recent advances in neural forecasting have produced major improvements in accuracy for probabilistic demand prediction. In this work, we propose novel improvements to the current state of the art by incorporating changes inspired by recent advances in Transformer architectures for Natural Language Processing. We develop a novel decoder-encoder attention for context-alignment, improving forecasting accuracy by allowing the network to study its own history based on the context for which it is producing a forecast. We also present a novel positional encoding that allows the neural network to learn context-dependent seasonality functions as well as arbitrary holiday distances. Finally we show that the current state of the art MQ-Forecaster (Wen et al., 2017) models display excess variability by failing to leverage previous errors in the forecast to improve accuracy. 
We propose a novel decoder-self attention scheme for forecasting that produces significant improvements in the excess variation of the forecast.", + "title": "MQTransformer: Multi-Horizon Forecasts with Context Dependent and Feedback-Aware Attention", + "authors": [ + "Carson Eisenach", + "Yagna Patel", + "Dhruv Madeka" + ], + "emails": [ + "~Dhruv_Madeka1", + "~Yagna_Patel1", + "~Carson_Eisenach1" + ], + "rank": 1205 + }, + { + "url": "https://openreview.net/forum?id=yvuk0RsLoP7", + "decision": "Reject", + "ratings": [ + 7, + 5, + 7, + 4 + ], + "rating": "5.62", + "confidences": [ + 4, + 3, + 2, + 4 + ], + "abstract": "We propose a novel adversarial training method which leverages both the local and global information to defend adversarial attacks. Existing adversarial training methods usually generate adversarial perturbations locally in a supervised manner and fail to consider the data manifold information in a global way. Consequently, the resulting adversarial examples may corrupt the underlying data structure and are typically biased towards the decision boundary. In this work, we exploit both the local and global information of data manifold to generate adversarial examples in an unsupervised manner. Specifically, we design our novel framework via an adversarial game between a discriminator and a classifier: the discriminator is learned to differentiate the latent distributions of the natural data and the perturbed counterpart, while the classifier is trained to recognize accurately the perturbed examples as well as enforcing the invariance between the two latent distributions. We conduct a series of analysis on the model robustness and also verify the effectiveness of our proposed method empirically. Experimental results show that our method substantially outperforms the recent state-of-the-art (i.e. Feature Scattering) in defending adversarial attacks by a large accuracy margin (e.g. 
17.0% and 18.1% on SVHN dataset, 9.3% and 17.4% on CIFAR-10 dataset, 6.0% and 16.2% on CIFAR-100 dataset for defending PGD20 and CW20 attacks respectively).", + "title": "Improving Model Robustness with Latent Distribution Locally and Globally", + "authors": [ + "Zhuang QIAN", + "Shufei Zhang", + "Kaizhu Huang", + "Qiufeng Wang", + "Rui Zhang", + "Xinping Yi" + ], + "emails": [ + "~Zhuang_QIAN1", + "~Kaizhu_Huang1", + "~Qiufeng_Wang2", + "~Shufei_Zhang1", + "~Rui_Zhang10", + "liverpool.ac.uk" + ], + "rank": 1206 + }, + { + "url": "https://openreview.net/forum?id=wxRwhSdORKG", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 7, + 6, + 7 + ], + "rating": "5.62", + "confidences": [ + 5, + 2, + 3, + 3 + ], + "abstract": "In goal-conditioned Hierarchical Reinforcement Learning (HRL), a high-level policy periodically sets subgoals for a low-level policy, and the low-level policy is trained to reach those subgoals. A proper subgoal representation function, which abstracts a state space to a latent subgoal space, is crucial for effective goal-conditioned HRL, since different low-level behaviors are induced by reaching subgoals in the compressed representation space. Observing that the high-level agent operates at an abstract temporal scale, we propose a slowness objective to effectively learn the subgoal representation (i.e., the high-level action space). We provide a theoretical grounding for the slowness objective. That is, selecting slow features as the subgoal space can achieve efficient hierarchical exploration. As a result of better exploration ability, our approach significantly outperforms state-of-the-art HRL and exploration methods on a number of benchmark continuous-control tasks. 
Thanks to the generality of the proposed subgoal representation learning method, empirical results also demonstrate that the learned representation and corresponding low-level policies can be transferred between distinct tasks.", + "title": "Learning Subgoal Representations with Slow Dynamics", + "authors": [ + "Siyuan Li", + "Lulu Zheng", + "Jianhao Wang", + "Chongjie Zhang" + ], + "emails": [ + "mails.tsinghua.edu.cn", + "~Chongjie_Zhang1", + "~Siyuan_Li1" + ], + "rank": 1207 + }, + { + "url": "https://openreview.net/forum?id=1NRMmEUyXMu", + "decision": "Reject", + "ratings": [ + 5, + 5, + 7, + 6 + ], + "rating": "5.62", + "confidences": [ + 5, + 4, + 3, + 4 + ], + "abstract": "Planning, the ability to analyze the structure of a problem in the large and decompose it into interrelated subproblems, is a hallmark of human intelligence. While deep reinforcement learning (RL) has shown great promise for solving relatively straightforward control tasks, it remains an open problem how to best incorporate planning into existing deep RL paradigms to handle increasingly complex environments. One prominent framework, Model-Based RL, learns a world model and plans using step-by-step virtual rollouts. This type of world model quickly diverges from reality when the planning horizon increases, thus struggling at long-horizon planning. How can we learn world models that endow agents with the ability to do temporally extended reasoning? In this work, we propose to learn graph-structured world models composed of sparse, multi-step transitions. We devise a novel algorithm to learn latent landmarks that are scattered (in terms of reachability) across the goal space as the nodes on the graph. In this same graph, the edges are the reachability estimates distilled from Q-functions. 
On a variety of high-dimensional continuous control tasks ranging from robotic manipulation to navigation, we demonstrate that our method, named L^{3}P, significantly outperforms prior work, and is oftentimes the only method capable of leveraging both the robustness of model-free RL and generalization of graph-search algorithms. We believe our work is an important step towards scalable planning in reinforcement learning. ", + "title": "World Model as a Graph: Learning Latent Landmarks for Planning", + "authors": [ + "Lunjun Zhang", + "Ge Yang", + "Bradly C Stadie" + ], + "emails": [ + "~Bradly_C_Stadie1", + "~Lunjun_Zhang1", + "~Ge_Yang1" + ], + "rank": 1208 + }, + { + "url": "https://openreview.net/forum?id=TlS3LBoDj3Z", + "decision": "Reject", + "ratings": [ + 6, + 7, + 6, + 4 + ], + "rating": "5.62", + "confidences": [ + 2, + 3, + 4, + 4 + ], + "abstract": "QTRAN is a multi-agent reinforcement learning (MARL) algorithm capable of learning the largest class of joint-action value functions up to date. However, despite its strong theoretical guarantee, it has shown poor empirical performance in complex environments, such as Starcraft Multi-Agent Challenge (SMAC). In this paper, we identify the performance bottleneck of QTRAN and propose a substantially improved version, coined QTRAN++. Our gains come from (i) stabilizing the training objective of QTRAN, (ii) removing the strict role separation between the action-value estimators of QTRAN, and (iii) introducing a multi-head mixing network for value transformation. Through extensive evaluation, we confirm that our diagnosis is correct, and QTRAN++ successfully bridges the gap between empirical performance and theoretical guarantee. In particular, QTRAN++ newly achieves state-of-the-art performance in the SMAC environment. The code will be released.", + "title": "QTRAN++: Improved Value Transformation for Cooperative Multi-Agent Reinforcement Learning", + "authors": [ + "Kyunghwan Son", + "Sungsoo Ahn", + "Roben D. 
Delos Reyes", + "Jinwoo Shin", + "Yung Yi" + ], + "emails": [ + "~Kyunghwan_Son1", + "kaist.ac.kr", + "~Jinwoo_Shin1", + "~Sungsoo_Ahn1", + "~Yung_Yi1" + ], + "rank": 1209 + }, + { + "url": "https://openreview.net/forum?id=c5QbJ1zob73", + "decision": "Reject", + "ratings": [ + 3, + 7, + 5, + 8, + 6 + ], + "rating": "5.62", + "confidences": [ + 4, + 4, + 2, + 2, + 4 + ], + "abstract": "We propose a novel theoretical framework to understand self-supervised learning methods that employ dual pairs of deep ReLU networks (e.g., SimCLR, BYOL). First, we prove that in each SGD update of SimCLR, the weights at each layer are updated by a \\emph{covariance operator} that specifically amplifies initial random selectivities that vary across data samples but survive averages over data augmentations. We show this leads to the emergence of hierarchical features, if the input data are generated from a hierarchical latent tree model. With the same framework, we also show analytically that in BYOL, the combination of BatchNorm and a predictor network creates an implicit contrastive term, acting as an approximate covariance operator. Additionally, for linear architectures we derive exact solutions for BYOL that provide conceptual insights into how BYOL can learn useful non-collapsed representations without any contrastive terms that separate negative pairs. Extensive ablation studies justify our theoretical findings. 
", + "title": "Understanding Self-supervised Learning with Dual Deep Networks", + "authors": [ + "Yuandong Tian", + "Lantao Yu", + "Xinlei Chen", + "Surya Ganguli" + ], + "emails": [ + "~Lantao_Yu2", + "~Surya_Ganguli1", + "~Yuandong_Tian1", + "~Xinlei_Chen1" + ], + "rank": 1210 + }, + { + "url": "https://openreview.net/forum?id=pAq1h9sQhqd", + "decision": "Reject", + "ratings": [ + 6, + 4, + 6, + 7 + ], + "rating": "5.62", + "confidences": [ + 4, + 4, + 2, + 3 + ], + "abstract": " We present an efficient stochastic algorithm (RSG+) for canonical correlation analysis (CCA) derived via a differential geometric perspective of the underlying optimization task. We show that exploiting the Riemannian structure of the problem reveals natural strategies for modified forms of manifold stochastic gradient descent schemes that have been variously used in the literature for numerical optimization on manifolds. Our developments complement existing methods for this problem which either require O(d3) time complexity per iteration with O(1t) convergence rate (where d is the dimensionality) or only extract the top 1 component with O(1t) convergence rate. In contrast, our algorithm achieves O(d2k) runtime complexity per iteration for extracting top k canonical components with O(1t) convergence rate. 
We present our theoretical analysis as well as experiments describing the empirical behavior of our algorithm, including a potential application of this idea for training fair models where the label of protected attribute is missing or otherwise unavailable.", + "title": "Stochastic Canonical Correlation Analysis: A Riemannian Approach", + "authors": [ + "Zihang Meng", + "Rudrasis Chakraborty", + "Vikas Singh" + ], + "emails": [ + "~Vikas_Singh1", + "~Zihang_Meng1", + "~Rudrasis_Chakraborty1" + ], + "rank": 1211 + }, + { + "url": "https://openreview.net/forum?id=uUTx2LOBMV", + "decision": "Reject", + "ratings": [ + 6, + 4, + 7, + 5 + ], + "rating": "5.62", + "confidences": [ + 2, + 4, + 5, + 2 + ], + "abstract": "As a novel model that bridges machine learning and quantum theory, tensor network (TN) has recently gained increasing attention and successful applications for processing natural images. However, for natural languages, it is unclear how to design a probabilistic encoding architecture to efficiently and accurately learn and classify texts based on TN. This paper proposes a general two-step scheme of text classification based on Tensor Network, which is named as TextTN. TextTN first encodes the word vectors in a probabilistic space by a generative TN (word-GTN), and then classifies a text sentence using a discriminative TN (sentence-DTN). Moreover, in sentence-DTN, its hyper-parameter (i.e., bond-dimension) can be analyzed and selected by the theoretical property of TextTN's expressive power. 
In experiments, our TextTN also obtains the state-of-the-art result on SST-5 sentiment classification task.", + "title": "TextTN: Probabilistic Encoding of Language on Tensor Network", + "authors": [ + "Peng Zhang", + "Jing Zhang", + "Xindian Ma", + "Siwei Rao", + "Guangjian Tian", + "Jun Wang" + ], + "emails": [ + "~Xindian_Ma1", + "~Siwei_Rao2", + "huawei.com", + "~Jun_Wang2", + "~Peng_Zhang17", + "~Jing_Zhang20" + ], + "rank": 1212 + }, + { + "url": "https://openreview.net/forum?id=5i4vRgoZauw", + "decision": "Reject", + "ratings": [ + 6, + 3, + 8, + 6 + ], + "rating": "5.62", + "confidences": [ + 5, + 4, + 3, + 4 + ], + "abstract": "After training on large datasets, certain deep neural networks are surprisingly good models of the neural mechanisms of adult primate visual object recognition. Nevertheless, these models are poor models of the development of the visual system because they posit millions of sequential, precisely coordinated synaptic updates, each based on a labeled image. While ongoing research is pursuing the use of unsupervised proxies for labels, we here explore a complementary strategy of reducing the required number of supervised synaptic updates to produce an adult-like ventral visual stream (as judged by the match to V1, V2, V4, IT, and behavior). Such models might require less precise machinery and energy expenditure to coordinate these updates and would thus move us closer to viable neuroscientific hypotheses about how the visual system wires itself up. Relative to the current leading model of the adult ventral stream, we here demonstrate that the total number of supervised weight updates can be substantially reduced using three complementary strategies: First, we find that only 2% of supervised updates (epochs and images) are needed to achieve ~80% of the match to adult ventral stream. Second, by improving the random distribution of synaptic connectivity, we find that 54% of the brain match can already be achieved \u201cat birth\" (i.e. 
no training at all). Third, we find that, by training only ~5% of model synapses, we can still achieve nearly 80% of the match to the ventral stream. When these three strategies are applied in combination, we find that these new models achieve ~80% of a fully trained model's match to the brain, while using two orders of magnitude fewer supervised synaptic updates. These results reflect first steps in modeling not just primate adult visual processing during inference, but also how the ventral visual stream might be \"wired up\" by evolution (a model's \"birth\" state) and by developmental learning (a model's updates based on visual experience).", + "title": "Wiring Up Vision: Minimizing Supervised Synaptic Updates Needed to Produce a Primate Ventral Stream", + "authors": [ + "Franziska Geiger", + "Martin Schrimpf", + "Tiago Marques", + "James J. DiCarlo" + ], + "emails": [ + "~Martin_Schrimpf1", + "~James_J._DiCarlo1", + "googlemail.com", + "mit.edu" + ], + "rank": 1213 + }, + { + "url": "https://openreview.net/forum?id=aYJr_Rt30p", + "decision": "Reject", + "ratings": [ + 7, + 6, + 4 + ], + "rating": "5.62", + "confidences": [ + 5, + 3, + 5 + ], + "abstract": "Colours can be represented in an infinite set of spaces highlighting distinct features. Here, we investigated the impact of colour spaces on the encoding capacity of a visual system that is subject to information compression, specifically variational autoencoders (VAEs) where bottlenecks are imposed. To this end, we propose a novel unsupervised task: colour space conversion (ColourConvNets). We trained several instances of VAEs whose input and output are in different colour spaces, e.g. from RGB to CIE L*a*b* (in total five colour spaces were examined). This allowed us to systematically study the influence of input-output colour spaces on the encoding efficiency and learnt representation. 
Our evaluations demonstrate that ColourConvNets with decorrelated output colour spaces produce higher quality images, also evident in pixel-wise low-level metrics such as colour difference (\u0394E), peak signal-to-noise ratio (PSNR) and structural similarity index measure (SSIM). We also assessed the ColourConvNets' capacity to reconstruct the global content in two downstream tasks: image classification (ImageNet) and scene segmentation (COCO). Our results show a 5-10% performance boost for decorrelating ColourConvNets with respect to the baseline network (whose input and output are RGB). Furthermore, we thoroughly analysed the finite embedding space of Vector Quantised VAEs with three different methods (single feature, hue shift and linear transformation). The interpretations reached with these techniques are in agreement suggesting that (i) luminance and chromatic information are encoded in separate embedding vectors, and (ii) the structure of the network's embedding space is determined by the output colour space.", + "title": "Learning Representation in Colour Conversion", + "authors": [ + "Arash Akbarinia", + "Raquel Gil-Rodriguez", + "Alban Flachot", + "Matteo Toscani" + ], + "emails": [ + "psychol.uni-giessen.de", + "~Arash_Akbarinia1" + ], + "rank": 1214 + }, + { + "url": "https://openreview.net/forum?id=pOHW7EwFbo9", + "decision": "Reject", + "ratings": [ + 4, + 7, + 6 + ], + "rating": "5.62", + "confidences": [ + 3, + 3, + 2 + ], + "abstract": "Many real-world problems require that reinforcement learning (RL) agents learn policies that not only maximize a scalar reward, but do so while meeting constraints, such as remaining below an energy consumption threshold. Typical approaches for solving constrained RL problems rely on Lagrangian relaxation, but these suffer from several limitations. We draw a connection between multi-objective RL and constrained RL, based on the key insight that the constraint-satisfying optimal policy must be Pareto optimal. 
This leads to a novel, multi-objective perspective for constrained RL. We propose a framework that uses a multi-objective RL algorithm to find a Pareto front of policies that trades off between the reward and constraint(s), and simultaneously searches along this front for constraint-satisfying policies. We show that in practice, an instantiation of our framework outperforms existing approaches on several challenging continuous control domains, both in terms of solution quality and sample efficiency, and enables flexibility in recovering a portion of the Pareto front rather than a single constraint-satisfying policy.", + "title": "Explicit Pareto Front Optimization for Constrained Reinforcement Learning", + "authors": [ + "Sandy Huang", + "Abbas Abdolmaleki", + "Philemon Brakel", + "Steven Bohez", + "Nicolas Heess", + "Martin Riedmiller", + "raia hadsell" + ], + "emails": [ + "~Abbas_Abdolmaleki3", + "~Sandy_Huang1", + "~Nicolas_Heess1", + "~Martin_Riedmiller1", + "~Philemon_Brakel1", + "~Steven_Bohez1", + "~raia_hadsell1" + ], + "rank": 1215 + }, + { + "url": "https://openreview.net/forum?id=oVz-YWdiMjt", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 7 + ], + "rating": "5.62", + "confidences": [ + 4, + 4, + 5, + 3 + ], + "abstract": "The established approach to unsupervised protein contact prediction estimates coevolving positions using undirected graphical models. This approach trains a Potts model on a Multiple Sequence Alignment, then predicts that the edges with highest weight correspond to contacts in the 3D structure. On the other hand, increasingly large Transformers are being pretrained on protein sequence databases but have demonstrated mixed results for downstream tasks, including contact prediction. This has sparked discussion about the role of scale and attention-based models in unsupervised protein representation learning. 
We argue that attention is a principled model of protein interactions, grounded in real properties of protein family data. We introduce a simplified attention layer, factored attention, and show that it achieves comparable performance to Potts models, while sharing parameters both within and across families. Further, we extract contacts from the attention maps of a pretrained Transformer and show they perform competitively with the other two approaches. This provides evidence that large-scale pretraining can learn meaningful protein features when presented with unlabeled and unaligned data. We contrast factored attention with the Transformer to indicate that the Transformer leverages hierarchical signal in protein family databases not captured by our single-layer models. This raises the exciting possibility for the development of powerful structured models of protein family databases.", + "title": "Single Layers of Attention Suffice to Predict Protein Contacts", + "authors": [ + "Nick Bhattacharya", + "Neil Thomas", + "Roshan Rao", + "Justas Daupras", + "Peter K Koo", + "David Baker", + "Yun S. Song", + "Sergey Ovchinnikov" + ], + "emails": [ + "g.harvard.edu", + "~David_Baker1", + "~Peter_K_Koo1", + "~Nick_Bhattacharya1", + "~Yun_S._Song1", + "berkeley.edu", + "uw.edu" + ], + "rank": 1216 + }, + { + "url": "https://openreview.net/forum?id=XtPeiGx6BwC", + "decision": "Reject", + "ratings": [ + 4, + 6, + 7, + 6 + ], + "rating": "5.62", + "confidences": [ + 5, + 5, + 4, + 2 + ], + "abstract": "Test-time adversarial attacks have posed serious challenges to the robustness of machine-learning models, and in many settings the adversarial perturbation need not be bounded by small lp-norms. Motivated by the semantics-preserving attacks in vision and security domain, we investigate relational adversaries, a broad class of attackers who create adversarial examples that are in a re\ufb02exive-transitive closure of a logical relation. 
We analyze the conditions for robustness and propose normalize-and-predict \u2013 a learning framework with provable robustness guarantee. We compare our approach with adversarial training and derive an uni\ufb01ed framework that provides bene\ufb01ts of both approaches. Guided by our theoretical \ufb01ndings, we apply our framework to malware detection and image classi\ufb01cation. Results of both tasks show that attacks using relational adversaries frequently fool existing models, but our uni\ufb01ed framework can signi\ufb01cantly enhance their robustness.\n", + "title": "Robustness against Relational Adversary", + "authors": [ + "Yizhen Wang", + "Xiaozhu Meng", + "Mihai Christodorescu", + "Somesh Jha", + "Ke Wang" + ], + "emails": [ + "visa.com", + "rice.edu", + "~Somesh_Jha1", + "~Ke_Wang1", + "~Yizhen_Wang1" + ], + "rank": 1217 + }, + { + "url": "https://openreview.net/forum?id=Z2qyx5vC8Xn", + "decision": "Reject", + "ratings": [ + 5, + 5, + 7, + 5 + ], + "rating": "5.62", + "confidences": [ + 2, + 4, + 4, + 3 + ], + "abstract": "An effective approach to exploration in reinforcement learning is to rely on an agent's uncertainty over the optimal policy, which can yield near-optimal exploration strategies in tabular settings. However, in non-tabular settings that involve function approximators, obtaining accurate uncertainty estimates is almost as challenging as the exploration problem itself. In this paper, we highlight that value estimates are easily biased and temporally inconsistent. In light of this, we propose a novel method for estimating uncertainty over the value function that relies on inducing a distribution over temporal difference errors. This exploration signal controls for state-action transitions so as to isolate uncertainty in value that is due to uncertainty over the agent's parameters. Because our measure of uncertainty conditions on state-action transitions, we cannot act on this measure directly. 
Instead, we incorporate it as an intrinsic reward and treat exploration as a separate learning problem, induced by the agent's temporal difference uncertainties. We introduce a distinct exploration policy that learns to collect data with high estimated uncertainty, which gives rise to a curriculum that smoothly changes throughout learning and vanishes in the limit of perfect value estimates. We evaluate our method on hard exploration tasks, including Deep Sea and Atari 2600 environments and find that our proposed form of exploration facilitates efficient exploration.", + "title": "Temporal Difference Uncertainties as a Signal for Exploration", + "authors": [ + "Sebastian Flennerhag", + "Jane X Wang", + "Pablo Sprechmann", + "Francesco Visin", + "Alexandre Galashov", + "Steven Kapturowski", + "Diana L Borsa", + "Nicolas Heess", + "Andre Barreto", + "Razvan Pascanu" + ], + "emails": [ + "~Jane_X_Wang1", + "~Diana_L_Borsa1", + "~Francesco_Visin1", + "~Steven_Kapturowski1", + "~Razvan_Pascanu1", + "~Alexandre_Galashov1", + "~Pablo_Sprechmann1", + "~Nicolas_Heess1", + "~Sebastian_Flennerhag1", + "~Andre_Barreto1" + ], + "rank": 1218 + }, + { + "url": "https://openreview.net/forum?id=8VXvj1QNRl1", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 2, + 7, + 9 + ], + "rating": "5.61", + "confidences": [ + 4, + 5, + 5, + 4 + ], + "abstract": "Learning meaningful representations that disentangle the underlying structure of the data generating process is considered to be of key importance in machine learning. While disentangled representations were found to be useful for diverse tasks such as abstract reasoning and fair classification, their scalability and real-world impact remain questionable.\nWe introduce a new high-resolution dataset with 1M simulated images and over 1,800 annotated real-world images of the same setup. 
In contrast to previous work, this new dataset exhibits correlations, a complex underlying structure, and allows to evaluate transfer to unseen simulated and real-world settings where the encoder i) remains in distribution or ii) is out of distribution.\nWe propose new architectures in order to scale disentangled representation learning to realistic high-resolution settings and conduct a large-scale empirical study of disentangled representations on this dataset. We observe that disentanglement is a good predictor for out-of-distribution (OOD) task performance.", + "title": "On the Transfer of Disentangled Representations in Realistic Settings", + "authors": [ + "Andrea Dittadi", + "Frederik Tr\u00e4uble", + "Francesco Locatello", + "Manuel Wuthrich", + "Vaibhav Agrawal", + "Ole Winther", + "Stefan Bauer", + "Bernhard Sch\u00f6lkopf" + ], + "emails": [ + "~Manuel_Wuthrich1", + "~Frederik_Tr\u00e4uble1", + "~Andrea_Dittadi1", + "~Bernhard_Sch\u00f6lkopf1", + "~Stefan_Bauer1", + "~Vaibhav_Agrawal1", + "~Francesco_Locatello1", + "~Ole_Winther1" + ], + "rank": 1219 + }, + { + "url": "https://openreview.net/forum?id=WrNjg9tCLUt", + "decision": "Reject", + "ratings": [ + 6, + 6, + 4, + 6 + ], + "rating": "5.60", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Recently, federated learning (FL) has been subject to both security and privacy attacks posing a dilemmatic challenge on the underlying algorithmic designs: On the one hand, FL is shown to be vulnerable to backdoor attacks that stealthily manipulate the global model output using malicious model updates, and on the other hand, FL is shown vulnerable to inference attacks by a malicious aggregator inferring information about clients\u2019 data from their model updates. 
Unfortunately, existing defenses against these attacks are insufficient and mitigating both attacks at the same time is highly challenging, because while defeating backdoor attacks requires the analysis of model updates, protection against inference attacks prohibits access to the model updates to avoid information leakage. In this work, we introduce BAFFLE, a novel in-depth defense for FL that tackles this challenge. To mitigate backdoor attacks, it applies a multilayered defense by using a Model Filtering layer to detect and reject malicious model updates and a Poison Elimination layer to eliminate any effect of a remaining undetected weak manipulation. To impede inference attacks, we build private BAFFLE that securely evaluates the BAFFLE algorithm under encryption using sophisticated secure computation techniques. We extensively evaluate BAFFLE against state-of-the-art backdoor attacks on several datasets and applications, including image classification, word prediction, and IoT intrusion. 
We show that BAFFLE can entirely remove backdoors with a negligible effect on accuracy and that private BAFFLE is practical.", + "title": "BAFFLE: TOWARDS RESOLVING FEDERATED LEARNING\u2019S DILEMMA - THWARTING BACKDOOR AND INFERENCE ATTACKS", + "authors": [ + "Thien Duc Nguyen", + "Phillip Rieger", + "Hossein Yalame", + "Helen M\u00f6llering", + "Hossein Fereidooni", + "Samuel Marchal", + "Markus Miettinen", + "Azalia Mirhoseini", + "Ahmad-Reza Sadeghi", + "Thomas Schneider", + "Shaza Zeitouni" + ], + "emails": [ + "~Azalia_Mirhoseini1", + "trust.tu-darmstadt.de", + "encrypto.cs.tu-darmstadt.de", + "aalto.fi" + ], + "rank": 1220 + }, + { + "url": "https://openreview.net/forum?id=8cpHIfgY4Dj", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 5, + 7 + ], + "rating": "5.60", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "We study the offline meta-reinforcement learning (OMRL) problem, a paradigm which enables reinforcement learning (RL) algorithms to quickly adapt to unseen tasks without any interactions with the environments, making RL truly practical in many real-world applications. This problem is still not fully understood, for which two major challenges need to be addressed. First, offline RL usually suffers from bootstrapping errors of out-of-distribution state-actions which leads to divergence of value functions. Second, meta-RL requires efficient and robust task inference learned jointly with control policy. In this work, we enforce behavior regularization on learned policy as a general approach to offline RL, combined with a deterministic context encoder for efficient task inference. We propose a novel negative-power distance metric on bounded context embedding space, whose gradients propagation is detached from the Bellman backup. We provide analysis and insight showing that some simple design choices can yield substantial improvements over recent approaches involving meta-RL and distance metric learning. 
To the best of our knowledge, our method is the first model-free and end-to-end OMRL algorithm, which is computationally efficient and demonstrated to outperform prior algorithms on several meta-RL benchmarks.", + "title": "FOCAL: Efficient Fully-Offline Meta-Reinforcement Learning via Distance Metric Learning and Behavior Regularization", + "authors": [ + "Lanqing Li", + "Rui Yang", + "Dijun Luo" + ], + "emails": [ + "~Lanqing_Li1", + "~Dijun_Luo1", + "mails.tsinghua.edu.cn" + ], + "rank": 1221 + }, + { + "url": "https://openreview.net/forum?id=IfEkus1dpU", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5 + ], + "rating": "5.60", + "confidences": [ + 4, + 2, + 4 + ], + "abstract": "Cut-and-paste methods take an object from one image and insert it into another. Doing so often results in unrealistic looking images because the inserted object's shading is inconsistent with the target scene's shading. Existing reshading methods require a geometric and physical model of the inserted object, which is then rendered using environment parameters. Accurately constructing such a model only from a single image is beyond the current understanding of computer vision.\n\nWe describe an alternative procedure -- cut-and-paste neural rendering, to render the inserted fragment's shading field consistent with the target scene. We use a Deep Image Prior (DIP) as a neural renderer trained to render an image with consistent image decomposition inferences. The resulting rendering from DIP should have an albedo consistent with composite albedo; it should have a shading field that, outside the inserted fragment, is the same as the target scene's shading field; \nand composite surface normals are consistent with the final rendering's shading field. \nThe result is a simple procedure that produces convincing and realistic shading. Moreover, our procedure does not require rendered images or image-decomposition from real images in the training or labeled annotations. 
In fact, our only use of simulated ground truth is our use of a pre-trained normal estimator. Qualitative results are strong, supported by a user study comparing against state-of-the-art image harmonization baseline.", + "title": "Cut-and-Paste Neural Rendering", + "authors": [ + "Anand Bhattad", + "David Forsyth" + ], + "emails": [ + "~David_Forsyth1", + "~Anand_Bhattad1" + ], + "rank": 1222 + }, + { + "url": "https://openreview.net/forum?id=vCEhC7nOb6", + "decision": "Reject", + "ratings": [ + 4, + 5, + 7, + 7 + ], + "rating": "5.60", + "confidences": [ + 5, + 3, + 4, + 3 + ], + "abstract": "We analyze the inductive bias of gradient descent for weight normalized smooth homogeneous neural nets, when trained on exponential or cross-entropy loss. Our analysis focuses on exponential weight normalization (EWN), which encourages weight updates along the radial direction. This paper shows that the gradient flow path with EWN is equivalent to gradient flow on standard networks with an adaptive learning rate, and hence causes the weights to be updated in a way that prefers asymptotic relative sparsity. These results can be extended to hold for gradient descent via an appropriate adaptive learning rate. The asymptotic convergence rate of the loss in this setting is given by \u0398(1t(log\u2061t)2), and is independent of the depth of the network. We contrast these results with the inductive bias of standard weight normalization (SWN) and unnormalized architectures, and demonstrate their implications on synthetic data sets.Experimental results on simple data sets and architectures support our claim on sparse EWN solutions, even with SGD. 
This demonstrates its potential applications in learning prunable neural networks.", + "title": "Inductive Bias of Gradient Descent for Exponentially Weight Normalized Smooth Homogeneous Neural Nets", + "authors": [ + "Depen Morwani", + "Harish Guruprasad Ramaswamy" + ], + "emails": [ + "~Harish_Guruprasad_Ramaswamy1", + "~Depen_Morwani1" + ], + "rank": 1223 + }, + { + "url": "https://openreview.net/forum?id=cAvgPMAA3hb", + "decision": "Reject", + "ratings": [ + 7, + 6, + 5, + 4 + ], + "rating": "5.60", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "We present a simple yet powerful implicit neural function that can represent and render arbitrarily complex 3D scenes in a single network only from 2D observations. The function models 3D scenes as a general radiance field, which takes a set of 2D images with camera poses and intrinsics as input, constructs an internal representation for each 3D point of the scene, and renders the corresponding appearance and geometry of any 3D point viewing from an arbitrary angle. The key to our approach is to explicitly integrate the principle of multi-view geometry to obtain the internal representations from observed 2D views, such that the learned implicit representations empirically remain multi-view consistent. In addition, we introduce an effective neural module to learn general features for each pixel in 2D images, allowing the constructed internal 3D representations to be general as well. 
Extensive experiments demonstrate the superiority of our approach.", + "title": "GRF: Learning a General Radiance Field for 3D Scene Representation and Rendering", + "authors": [ + "Alex Trevithick", + "Bo Yang" + ], + "emails": [ + "williams.edu", + "~Bo_Yang7" + ], + "rank": 1224 + }, + { + "url": "https://openreview.net/forum?id=NlrFDOgRRH", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 8, + 4 + ], + "rating": "5.60", + "confidences": [ + 3, + 5, + 4, + 4, + 4 + ], + "abstract": "Despite recent progress in memory augmented neural network research, associative memory networks with a single external memory still show limited performance on complex relational reasoning tasks. The main reason for this problem comes from the lossy representation of a content-based addressing memory and its insufficient associating performance for long temporal sequence data. To address these problems, here we introduce a novel Distributed Associative Memory architecture (DAM) with Association Reinforcing Loss (ARL) function which enhances the relation reasoning performance of memory augmented neural network. In this framework, instead of relying on a single large external memory, we form a set of multiple smaller associative memory blocks and update these sub-memory blocks simultaneously and independently with the content-based addressing mechanism. Based on DAM architecture, we can effectively retrieve complex relational information by integrating diverse representations distributed across multiple sub-memory blocks with an attention mechanism. Moreover, to further enhance the relation modeling performance of memory network, we propose ARL which assists a task's target objective while learning relational information exist in data. ARL enables the memory augmented neural network to reinforce an association between input data and task objective by reproducing stochastically sampled input data from stored memory contents. 
With this content reproducing task, it enriches the representations with relational information. In experiments, we apply our two main approaches to Differential Neural Computer (DNC), which is one of the representative content-based addressing memory model and achieves state-of-the-art performance on both memorization and relational reasoning tasks.", + "title": "Distributed Associative Memory Network with Association Reinforcing Loss", + "authors": [ + "Taewon Park", + "Inchul Choi", + "Minho Lee" + ], + "emails": [ + "~Inchul_Choi1", + "gmail.com", + "~Taewon_Park1" + ], + "rank": 1225 + }, + { + "url": "https://openreview.net/forum?id=0fqoSxXBwI6", + "decision": "Reject", + "ratings": [ + 6, + 4, + 7, + 6 + ], + "rating": "5.60", + "confidences": [ + 4, + 5, + 4, + 2 + ], + "abstract": "3D object representation learning is a fundamental challenge in computer vision to draw inferences about the 3D world. Recent advances in deep learning have shown their efficiency in 3D object recognition, among which view-based methods have performed best so far. However, feature learning of multiple views in existing methods is mostly trained in a supervised fashion, which often requires a large amount of data labels with high cost. Hence, it is critical to learn multi-view feature representations in a self-supervised fashion. To this end, we propose a novel self-supervised learning paradigm of Multi-View Transformation Equivariant Representations (MV-TER), exploiting the equivariant transformations of a 3D object and its projected multiple views. Specifically, we perform a 3D transformation on a 3D object, and obtain multiple views before and after transformation via projection. Then, we self-train a representation learning module to capture the intrinsic 3D object representation by decoding 3D transformation parameters from the fused feature representations of multiple views before and after transformation. 
Experimental results demonstrate that the proposed MV-TER significantly outperforms the state-of-the-art view-based approaches in 3D object classification and retrieval tasks.", + "title": "Self-Supervised Multi-View Learning via Auto-Encoding 3D Transformations", + "authors": [ + "Xiang Gao", + "Wei Hu", + "Guo-Jun Qi" + ], + "emails": [ + "~Guo-Jun_Qi1", + "~Wei_Hu6", + "~Xiang_Gao2" + ], + "rank": 1226 + }, + { + "url": "https://openreview.net/forum?id=Rcmk0xxIQV", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 4 + ], + "rating": "5.60", + "confidences": [ + 2, + 5, + 4, + 4 + ], + "abstract": "We explore value-based multi-agent reinforcement learning (MARL) in the popular paradigm of centralized training with decentralized execution (CTDE). CTDE has an important concept, Individual-Global-Max (IGM) principle, which requires the consistency between joint and local action selections to support efficient local decision-making. However, in order to achieve scalability, existing MARL methods either limit representation expressiveness of their value function classes or relax the IGM consistency, which may suffer from instability risk or may not perform well in complex domains. This paper presents a novel MARL approach, called duPLEX dueling multi-agent Q-learning (QPLEX), which takes a duplex dueling network architecture to factorize the joint value function. This duplex dueling structure encodes the IGM principle into the neural network architecture and thus enables efficient value function learning. Theoretical analysis shows that QPLEX achieves a complete IGM function class. 
Empirical experiments on StarCraft II micromanagement tasks demonstrate that QPLEX significantly outperforms state-of-the-art baselines in both online and offline data collection settings, and also reveal that QPLEX achieves high sample efficiency and can benefit from offline datasets without additional online exploration.", + "title": "QPLEX: Duplex Dueling Multi-Agent Q-Learning", + "authors": [ + "Jianhao Wang", + "Zhizhou Ren", + "Terry Liu", + "Yang Yu", + "Chongjie Zhang" + ], + "emails": [ + "~Yang_Yu5", + "~Zhizhou_Ren1", + "~Chongjie_Zhang1", + "~Terry_Liu2", + "~Jianhao_Wang1" + ], + "rank": 1227 + }, + { + "url": "https://openreview.net/forum?id=7nfCtKep-v", + "decision": "Reject", + "ratings": [ + 6, + 4, + 6, + 6 + ], + "rating": "5.60", + "confidences": [ + 4, + 3, + 3, + 5 + ], + "abstract": "Natural language processing (NLP) tasks, ranging from text classification to text\ngeneration, have been revolutionised by pretrained BERT models. This allows\ncorporations to easily build powerful APIs by encapsulating fine-tuned BERT\nmodels. These BERT-based APIs are often designed to not only provide reliable\nservice but also protect intellectual properties or privacy-sensitive information of\nthe training data. However, a series of privacy and robustness issues may still exist\nwhen a fine-tuned BERT model is deployed as a service. In this work, we first\npresent an effective model extraction attack, where the adversary can practically\nsteal a BERT-based API (the target/victim model). We then demonstrate: (1)\nhow the extracted model can be further exploited to develop effective attribute\ninference attack to expose sensitive information of the training data of the victim\nmodel; (2) how the extracted model can lead to highly transferable adversarial\nattacks against the victim model. 
Extensive experiments on multiple benchmark\ndatasets under various realistic settings validate the potential privacy and adversarial\nvulnerabilities of BERT-based APIs.", + "title": "EXPLORING VULNERABILITIES OF BERT-BASED APIS", + "authors": [ + "Xuanli He", + "Lingjuan Lyu", + "Lichao Sun", + "Xiaojun Chang", + "Jun Zhao" + ], + "emails": [ + "~Xiaojun_Chang3", + "~Lingjuan_Lyu1", + "~Lichao_Sun1", + "~Xuanli_He2", + "~Jun_Zhao1" + ], + "rank": 1228 + }, + { + "url": "https://openreview.net/forum?id=UwOMufsTqCy", + "decision": "Reject", + "ratings": [ + 5, + 7, + 5, + 6 + ], + "rating": "5.60", + "confidences": [ + 4, + 2, + 4, + 5 + ], + "abstract": "Rule-based models, e.g., decision trees, are widely used in scenarios demanding high model interpretability for their transparent inner structures and good model expressivity. However, rule-based models are hard to optimize, especially on large data sets, due to their discrete parameters and structures. Ensemble methods and fuzzy/soft rules are commonly used to tackle these issues, but they sacrifice the model interpretability. In this paper, we propose a new classifier, named Rule-based Representation Learner (RRL), that automatically learns interpretable non-fuzzy rules for data representation. To train the non-differentiable RRL effectively, we project it to a continuous space and propose a novel training method, called Gradient Grafting, that can directly optimize the discrete model using gradient descent. An improved design of logical activation functions is also devised to increase the scalability of RRL and enable it to discretize the continuous features end-to-end. 
Exhaustive experiments on 9 small and 4 large data sets show that RRL outperforms the competitive approaches, has low complexity close to the simple decision trees, and is rational for its main technical contributions.", + "title": "RRL: A Scalable Classifier for Interpretable Rule-Based Representation Learning", + "authors": [ + "Zhuo Wang", + "Wei Zhang", + "Ning Liu", + "Jianyong Wang" + ], + "emails": [ + "~Jianyong_Wang2", + "~Ning_Liu3", + "~Zhuo_Wang2", + "~Wei_Zhang27" + ], + "rank": 1229 + }, + { + "url": "https://openreview.net/forum?id=8YFhXYe1Ps", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 5, + 6 + ], + "rating": "5.60", + "confidences": [ + 4, + 4, + 3, + 5, + 4 + ], + "abstract": "Current state of the art computer vision applications rely on highly complex models. Their interpretability is mostly limited to post-hoc methods which are not guaranteed to be faithful to the model. To elucidate a model\u2019s decision, we present a novel interpretable model based on an invertible deep convolutional network. Our model generates meaningful, faithful, and ideal counterfactuals. Using PCA on the classifier\u2019s input, we can also create \u201cisofactuals\u201d\u2013 image interpolations with the same outcome but visually meaningful different features. Counter- and isofactuals can be used to identify positive and negative evidence in an image. This can also be visualized with heatmaps. We evaluate our approach against gradient-based attribution methods, which we find to produce meaningless adversarial perturbations. Using our method, we reveal biases in three different datasets. In a human subject experiment, we test whether non-experts find our method useful to spot spurious correlations learned by a model. 
Our work is a step towards more trustworthy explanations for computer vision.", + "title": "Interpretability Through Invertibility: A Deep Convolutional Network With Ideal Counterfactuals And Isosurfaces", + "authors": [ + "Leon Sixt", + "Martin Schuessler", + "Philipp Wei\u00df", + "Tim Landgraf" + ], + "emails": [ + "itp.tu-berlin.de", + "tu-berlin.de", + "~Tim_Landgraf1", + "~Leon_Sixt1" + ], + "rank": 1230 + }, + { + "url": "https://openreview.net/forum?id=NcFEZOi-rLa", + "decision": "Accept (Poster)", + "ratings": [ + 8, + 7, + 4, + 4 + ], + "rating": "5.60", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Contrasting the previous evidence that neurons in the later layers of a Convolutional Neural Network (CNN) respond to complex object shapes, recent studies have shown that CNNs actually exhibit a 'texture bias': given an image with both texture and shape cues (e.g., a stylized image), a CNN is biased towards predicting the category corresponding to the texture. However, these previous studies conduct experiments on the final classification output of the network, and fail to robustly evaluate the bias contained (i) in the latent representations, and (ii) on a per-pixel level. In this paper, we design a series of experiments that overcome these issues. We do this with the goal of better understanding what type of shape information contained in the network is discriminative, where shape information is encoded, as well as when the network learns about object shape during training. We show that a network learns the majority of overall shape information at the first few epochs of training and that this information is largely encoded in the last few layers of a CNN. Finally, we show that the encoding of shape does not imply the encoding of localized per-pixel semantic information. 
The experimental results and findings provide a more accurate understanding of the behaviour of current CNNs, thus helping to inform future design choices.", + "title": "Shape or Texture: Understanding Discriminative Features in CNNs", + "authors": [ + "Md Amirul Islam", + "Matthew Kowal", + "Patrick Esser", + "Sen Jia", + "Bj\u00f6rn Ommer", + "Konstantinos G. Derpanis", + "Neil Bruce" + ], + "emails": [ + "~Matthew_Kowal1", + "~Neil_Bruce1", + "~Patrick_Esser1", + "~Konstantinos_G._Derpanis1", + "~Md_Amirul_Islam1", + "~Sen_Jia1", + "~Bj\u00f6rn_Ommer2" + ], + "rank": 1231 + }, + { + "url": "https://openreview.net/forum?id=w_BtePbtmx4", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5, + 6, + 7 + ], + "rating": "5.60", + "confidences": [ + 4, + 4, + 4, + 4, + 4 + ], + "abstract": "Training Deep Neural Networks (DNNs) places immense compute requirements on the underlying hardware platforms, expending large amounts of time and energy. We proposeLoCal+SGD, a new algorithmic approach to accelerate DNN train-ing by selectively combining localized or Hebbian learning within a StochasticGradient Descent (SGD) based training framework. Back-propagation is a computationally expensive process that requires 2 Generalized Matrix Multiply (GEMM)operations to compute the error and weight gradients for each layer. We alleviate this by selectively updating some layers\u2019 weights using localized learning rules that require only 1 GEMM operation per layer. Further, since the weight update is performed during the forward pass itself, the layer activations for the mini-batch do not need to be stored until the backward pass, resulting in a reduced memory footprint. Localized updates can substantially boost training speed, but need to be used selectively and judiciously in order to preserve accuracy and convergence. 
We address this challenge through the design of a Learning Mode Selection Algorithm, where all layers start with SGD, and as epochs progress, layers gradually transition to localized learning. Specifically, for each epoch, the algorithm identifies a Localized\u2192SGDtransition layer, which delineates the network into two regions. Layers before the transition layer use localized updates, while the transition layer and later layers use gradient-based updates. The trend in the weight updates made to the transition layer across epochs is used to determine how the boundary betweenSGD and localized updates is shifted in future epochs. We also propose a low-cost weak supervision mechanism by controlling the learning rate of localized updates based on the overall training loss. We applied LoCal+SGDto 8 image recognition CNNs (including ResNet50 and MobileNetV2) across 3 datasets (Cifar10, Cifar100and ImageNet). Our measurements on a Nvidia GTX 1080Ti GPU demonstrate upto 1.5\u00d7improvement in end-to-end training time with\u223c0.5% loss in Top-1classification accuracy.", + "title": "Accelerating DNN Training through Selective Localized Learning ", + "authors": [ + "Sarada Krithivasan", + "Sanchari Sen", + "Swagath Venkataramani", + "Anand Raghunathan" + ], + "emails": [ + "~Sanchari_Sen1", + "us.ibm.com", + "~Sarada_Krithivasan1", + "~Anand_Raghunathan1" + ], + "rank": 1232 + }, + { + "url": "https://openreview.net/forum?id=wC99I7uIFe", + "decision": "Reject", + "ratings": [ + 5, + 6, + 7, + 4 + ], + "rating": "5.60", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "In this paper, we propose the discrete Gaussian based differentially private federated learning (D2p-fed), a unified scheme to achieve both differential privacy (DP) and communication efficiency in federated learning (FL). In particular, compared with the only prior work taking care of both aspects, D2p-fed provides stronger privacy guarantee, better composability and smaller communication cost. 
The key idea is to apply the discrete Gaussian noise to the private data transmission. We provide complete analysis of the privacy guarantee, communication cost and convergence rate of D2p-fed. We evaluated D2p-fed on INFIMNIST and CIFAR10. The results show that D2p-fed outperforms the-state-of-the-art by 4.7% to 13.0% in terms of model accuracy while saving one third of the communication cost. The code for evaluation is available in the supplementary material.", + "title": "D2p-fed:Differentially Private Federated Learning with Efficient Communication", + "authors": [ + "Lun Wang", + "Ruoxi Jia", + "Dawn Song" + ], + "emails": [ + "vt.edu", + "~Dawn_Song1", + "~Lun_Wang1" + ], + "rank": 1233 + }, + { + "url": "https://openreview.net/forum?id=LuyryrCs6Ez", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5 + ], + "rating": "5.60", + "confidences": [ + 3, + 3, + 4 + ], + "abstract": "Humans can learn and reason under substantial uncertainty in a space of infinitely many concepts, including structured relational concepts (\u201ca scene with objects that have the same color\u201d) and ad-hoc categories defined through goals (\u201cobjects that could fall on one\u2019s head\u201d). In contrast, standard classification benchmarks: 1) consider only a fixed set of category labels, 2) do not evaluate compositional concept learning and 3) do not explicitly capture a notion of reasoning under uncertainty. We introduce a new few-shot, meta-learning benchmark, Compositional Reasoning Under Uncertainty (CURI) to bridge this gap. CURI evaluates different aspects of productive and systematic generalization, including abstract understandings of disentangling, productive generalization, learning boolean operations, variable binding, etc. Importantly, it also defines a model-independent \u201ccompositionality gap\u201d to evaluate difficulty of generalizing out-of-distribution along each of these axes. 
Extensive evaluations across a range of modeling choices spanning different modalities (image, schemas, and sounds), splits, privileged auxiliary concept information, and choices of negatives reveal substantial scope for modeling advances on the proposed task. All code and datasets will be available online.", + "title": "CURI: A Benchmark for Productive Concept Learning Under Uncertainty", + "authors": [ + "Shanmukha Ramakrishna Vedantam", + "Arthur Szlam", + "Maximilian Nickel", + "Ari S. Morcos", + "Brenden M. Lake" + ], + "emails": [ + "~Maximilian_Nickel1", + "~Shanmukha_Ramakrishna_Vedantam1", + "~Brenden_M._Lake1", + "~Arthur_Szlam1", + "~Ari_S._Morcos1" + ], + "rank": 1234 + }, + { + "url": "https://openreview.net/forum?id=QB7FkNVAfxa", + "decision": "Reject", + "ratings": [ + 5, + 3, + 9, + 6 + ], + "rating": "5.59", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "Neural networks trained via gradient descent with random initialization and without any regularization enjoy good generalization performance in practice despite being highly overparametrized. A promising direction to explain this phenomenon is the \\emph{Neural Tangent Kernel} (NTK), which characterizes the implicit regularization effect of gradient flow/descent on infinitely wide neural networks with random initialization. However, a non-asymptotic analysis that connects generalization performance, initialization, and optimization for finite width networks remains elusive. In this paper, we present a novel analysis of overparametrized single-hidden layer linear networks, which formally connects initialization, optimization, and overparametrization with generalization performance. We exploit the fact that gradient flow preserves a certain matrix that characterizes the \\emph{imbalance} of the network weights, to show that the squared loss converges exponentially at a rate that depends on the level of imbalance of the initialization. 
Such guarantees on the convergence rate allow us to show that large hidden layer width, together with (properly scaled) random initialization, implicitly constrains the dynamics of the network parameters to be close to a low-dimensional manifold. In turn, minimizing the loss over this manifold leads to solutions with good generalization, which correspond to the min-norm solution in the linear case. Finally, we derive a novel O(h\u22121/2) upper-bound on the operator norm distance between the trained network and the min-norm solution, where h is the hidden layer width. ", + "title": "On the Explicit Role of Initialization on the Convergence and Generalization Properties of Overparametrized Linear Networks", + "authors": [ + "Hancheng Min", + "Salma Tarmoun", + "Rene Vidal", + "Enrique Mallada" + ], + "emails": [ + "~Salma_Tarmoun1", + "~Rene_Vidal1", + "~Enrique_Mallada1", + "~Hancheng_Min1" + ], + "rank": 1235 + }, + { + "url": "https://openreview.net/forum?id=DFIoGDZejIB", + "decision": "Reject", + "ratings": [ + 5, + 6, + 7, + 4, + 5 + ], + "rating": "5.59", + "confidences": [ + 3, + 5, + 4, + 3, + 2 + ], + "abstract": "Much recent work has focused on how an agent can learn what to do from human feedback, leading to two major paradigms. The first paradigm is reward learning, in which the agent learns a reward model through human feedback that is provided externally from the environment. The second is assistance, in which the human is modeled as a part of the environment, and the true reward function is modeled as a latent variable in the environment that the agent may make inferences about. The key difference between the two paradigms is that in the reward learning paradigm, by construction there is a separation between reward learning and control using the learned reward. In contrast, in assistance these functions are performed as needed by a single policy. 
By merging reward learning and control, assistive agents can reason about the impact of control actions on reward learning, leading to several advantages over agents based on reward learning. We illustrate these advantages in simple environments by showing desirable qualitative behaviors of assistive agents that cannot be found by agents based on reward learning.", + "title": "Benefits of Assistance over Reward Learning", + "authors": [ + "Rohin Shah", + "Pedro Freire", + "Neel Alex", + "Rachel Freedman", + "Dmitrii Krasheninnikov", + "Lawrence Chan", + "Michael D Dennis", + "Pieter Abbeel", + "Anca Dragan", + "Stuart Russell" + ], + "emails": [ + "~Lawrence_Chan2", + "~Rachel_Freedman1", + "~Anca_Dragan1", + "~Neel_Alex1", + "~Michael_D_Dennis1", + "~Rohin_Shah1", + "~Pieter_Abbeel2", + "~Stuart_Russell1", + "~Dmitrii_Krasheninnikov1", + "~Pedro_Freire1" + ], + "rank": 1236 + }, + { + "url": "https://openreview.net/forum?id=HbZTcIuiMAG", + "decision": "Reject", + "ratings": [ + 4, + 8, + 5, + 7 + ], + "rating": "5.58", + "confidences": [ + 4, + 1, + 3, + 4 + ], + "abstract": "Parametric computer-aided design (CAD) is a standard paradigm used for the design of manufactured objects. CAD designers perform modeling operations, such as sketch and extrude, to form a construction sequence that makes up a final design. Despite the pervasiveness of parametric CAD and growing interest from the research community, a dataset of human designed 3D CAD construction sequences has not been available to-date. In this paper we present the Fusion 360 Gallery reconstruction dataset and environment for learning CAD reconstruction. We provide a dataset of 8,625 designs, comprising sequential sketch and extrude modeling operations, together with a complementary environment called the Fusion 360 Gym, to assist with performing CAD reconstruction. 
We outline a standard CAD reconstruction task, together with evaluation metrics, and present results from a novel method using neurally guided search to recover a construction sequence from a target geometry.", + "title": "Fusion 360 Gallery: A Dataset and Environment for Programmatic CAD Reconstruction", + "authors": [ + "Karl Willis", + "Yewen Pu", + "Jieliang Luo", + "Hang Chu", + "Tao Du", + "Joseph Lambourne", + "Armando Solar-Lezama", + "Wojciech Matusik" + ], + "emails": [ + "~Hang_Chu4", + "~Armando_Solar-Lezama1", + "~Wojciech_Matusik2", + "~Karl_Willis1", + "~Yewen_Pu1", + "autodesk.com", + "~Jieliang_Luo1", + "~Tao_Du1" + ], + "rank": 1237 + }, + { + "url": "https://openreview.net/forum?id=yoVo1fThmS1", + "decision": "Reject", + "ratings": [ + 8, + 5, + 6, + 4 + ], + "rating": "5.57", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "We propose a new method for novelty detection that can tolerate high corruption of the training points, whereas previous works assumed either no or very low corruption. Our method trains a robust variational autoencoder (VAE), which aims to generate a model for the uncorrupted training points. To gain robustness to high corruption, we incorporate the following four changes to the common VAE: 1. Extracting crucial features of the latent code by a carefully designed dimension reduction component for distributions; 2. Modeling the latent distribution as a mixture of Gaussian low-rank inliers and full-rank outliers, where the testing only uses the inlier model; 3. Applying the Wasserstein-1 metric for regularization, instead of the Kullback-Leibler (KL) divergence; and 4. Using a least absolute deviation error for reconstruction. We establish both robustness to outliers and suitability to low-rank modeling of the Wasserstein metric as opposed to the KL divergence. 
We illustrate state-of-the-art results on standard benchmarks for novelty detection.", + "title": "Novelty Detection via Robust Variational Autoencoding", + "authors": [ + "Chieh-Hsin Lai", + "Dongmian Zou", + "Gilad Lerman" + ], + "emails": [ + "~Gilad_Lerman1", + "~Dongmian_Zou1", + "~Chieh-Hsin_Lai1" + ], + "rank": 1238 + }, + { + "url": "https://openreview.net/forum?id=a7gkBG1m6e", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 6 + ], + "rating": "5.57", + "confidences": [ + 2, + 4, + 4, + 4 + ], + "abstract": "There is considerable evidence that deep neural networks are vulnerable to adversarial perturbations applied directly to their digital inputs. However, it remains an open question whether this translates to vulnerabilities in real-world systems. Specifically, in the context of image inputs to autonomous driving systems, an attack can be achieved only by modifying the physical environment, so as to ensure that the resulting stream of video inputs to the car's controller leads to incorrect driving decisions. Inducing this effect on the video inputs indirectly through the environment requires accounting for system dynamics and tracking viewpoint changes. We propose a scalable and efficient approach for finding adversarial physical modifications, using a differentiable approximation for the mapping from environmental modifications\u2014namely, rectangles drawn on the road\u2014to the corresponding video inputs to the controller network. Given the color, location, position, and orientation parameters of the rectangles, our mapping composites them onto pre-recorded video streams of the original environment. Our mapping accounts for geometric and color variations, is differentiable with respect to rectangle parameters, and uses multiple original video streams obtained by varying the driving trajectory. 
When combined with a neural network-based controller, our approach allows the design of adversarial modifications through end-to-end gradient-based optimization. We evaluate our approach using the Carla autonomous driving simulator, and show that it is significantly more scalable and far more effective at generating attacks than a prior black-box approach based on Bayesian Optimization.", + "title": "Finding Physical Adversarial Examples for Autonomous Driving with Fast and Differentiable Image Compositing", + "authors": [ + "Jinghan Yang", + "Adith Boloor", + "Ayan Chakrabarti", + "Xuan Zhang", + "Yevgeniy Vorobeychik" + ], + "emails": [ + "wustl.edu", + "~Yevgeniy_Vorobeychik1", + "~Xuan_Zhang1", + "~Ayan_Chakrabarti1" + ], + "rank": 1239 + }, + { + "url": "https://openreview.net/forum?id=XEyElxd9zji", + "decision": "Reject", + "ratings": [ + 4, + 7, + 5, + 7 + ], + "rating": "5.57", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": " Brains learn robustly, and generalize effortlessly between different learning tasks; in contrast, robustness and generalization across tasks are well known weaknesses of artificial neural nets (ANNs). How can we use our accelerating understanding of the brain to improve these and other aspects of ANNs? Here we hypothesize that (a) Brains employ synaptic plasticity rules that serve as proxies for GD; (b) These rules themselves can be learned by GD on the rule parameters; and (c) This process may be a missing ingredient for the development of ANNs that generalize well and are robust to adversarial perturbations. We provide both empirical and theoretical evidence for this hypothesis. In our experiments, plasticity rules for the synaptic weights of recurrent neural nets (RNNs) are learned through GD and are found to perform reasonably well (with no backpropagation). 
We find that plasticity rules learned by this process generalize from one type of data/classifier to others (e.g., rules learned on synthetic data work well on MNIST/Fashion MNIST) and converge with fewer updates. Moreover, the classifiers learned using plasticity rules exhibit surprising levels of tolerance to adversarial perturbations. In the special case of the last layer of a classification network, we show analytically that GD on the plasticity rule recovers (and improves upon) the perceptron algorithm and the multiplicative weights method. Finally, we argue that applying GD to learning rules is biologically plausible, in the sense that it can be learned over evolutionary time: we describe a genetic setting where natural selection of a numerical parameter over a sequence of generations provably simulates a simple variant of GD.", + "title": "Learning with Plasticity Rules: Generalization and Robustness", + "authors": [ + "Rares C Cristian", + "Max Dabagia", + "Christos Papadimitriou", + "Santosh Vempala" + ], + "emails": [ + "~Rares_C_Cristian1", + "~Christos_Papadimitriou2", + "~Max_Dabagia1", + "~Santosh_Vempala1" + ], + "rank": 1240 + }, + { + "url": "https://openreview.net/forum?id=Rhl8IoYzdSI", + "decision": "Reject", + "ratings": [ + 7, + 7, + 6, + 4, + 2 + ], + "rating": "5.57", + "confidences": [ + 2, + 4, + 4, + 2, + 2 + ], + "abstract": "Most predictive systems currently in use do not report any useful information for auditing their associated uncertainty and evaluating the corresponding risk. Taking it for granted that their replacement may not be advisable in the short term, in this paper we propose a novel approach to modelling confidence in such systems while preserving their predictions. The method is based on the Chebyshev Polynomial Approximation Network (the ChePAN), a new way of modelling aleatoric uncertainty in a regression scenario. 
In the case addressed here, uncertainty is modelled by building conditional quantiles on top of the original pointwise forecasting system considered as a black box, i.e. without making assumptions about its internal structure. Furthermore, the ChePAN allows users to consistently choose how to constrain any predicted quantile with respect to the original forecaster. Experiments show that the proposed method scales to large size data sets and transfers the advantages of quantile regression to estimating black-box uncertainty.", + "title": "ChePAN: Constrained Black-Box Uncertainty Modelling with Quantile Regression", + "authors": [ + "Axel Brando", + "Joan Gimeno", + "Jose Antonio Rodriguez-Serrano", + "Jordi Vitria" + ], + "emails": [ + "~Jordi_Vitria1", + "~Axel_Brando1", + "maia.ub.edu", + "bbvadata.com" + ], + "rank": 1241 + }, + { + "url": "https://openreview.net/forum?id=arNvQ7QRyVb", + "decision": "Reject", + "ratings": [ + 6, + 4, + 6, + 7 + ], + "rating": "5.57", + "confidences": [ + 2, + 5, + 3, + 4 + ], + "abstract": "Effective lifelong learning across diverse tasks requires diverse knowledge, yet transferring irrelevant knowledge may lead to interference and catastrophic forgetting. In deep networks, transferring the appropriate granularity of knowledge is as important as the transfer mechanism, and must be driven by the relationships among tasks. We first show that the lifelong learning performance of several current deep learning architectures can be significantly improved by transfer at the appropriate layers. We then develop an expectation-maximization (EM) method to automatically select the appropriate transfer configuration and optimize the task network weights. 
This EM-based selective transfer is highly effective, as demonstrated on three algorithms in several lifelong object classification scenarios.", + "title": "Sharing Less is More: Lifelong Learning in Deep Networks with Selective Layer Transfer", + "authors": [ + "Seungwon Lee", + "Sima Behpour", + "ERIC EATON" + ], + "emails": [ + "~Seungwon_Lee2", + "~ERIC_EATON1", + "~Sima_Behpour1" + ], + "rank": 1242 + }, + { + "url": "https://openreview.net/forum?id=SQ7EHTDyn9Y", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 5 + ], + "rating": "5.57", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Optimization nondeterminism causes uncertainty when improving neural networks, with small changes in performance difficult to discern from run-to-run variability. While uncertainty can be reduced by training multiple copies of a model with different random seeds, doing so is time-consuming, costly, and makes reproducibility challenging. Despite this, little attention has been paid towards establishing an understanding of this problem. In this work, we establish an experimental protocol for understanding the effect of optimization nondeterminism on model diversity, which allows us to study the independent effects of a variety of sources of nondeterminism. Surprisingly, we find that each source of nondeterminism all have similar effects on multiple measures of model diversity. To explain this intriguing fact, we examine and identify the instability of model training, when taken as an end-to-end procedure, as the key determinant. We show that even one-bit changes in initial model parameters result in models that converge to vastly different values. Last, we demonstrate that recent methods in accelerated model ensembling hold promise for reducing the effects of instability on run-to-run variability.", + "title": "On Nondeterminism and Instability in Neural Network Optimization", + "authors": [ + "Cecilia Summers", + "Michael J. 
Dinneen" + ], + "emails": [ + "~Michael_J._Dinneen1", + "~Cecilia_Summers1" + ], + "rank": 1243 + }, + { + "url": "https://openreview.net/forum?id=RmcPm9m3tnk", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 4, + 6 + ], + "rating": "5.57", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Human perception excels at building compositional hierarchies of parts and objects from unlabeled scenes that help systematic generalization. Yet most work on generative scene modeling either ignores the part-whole relationship or assumes access to predefined part labels. In this paper, we propose Generative Scene Graph Networks (GSGNs), the first deep generative model that learns to discover the primitive parts and infer the part-whole relationship jointly from multi-object scenes without supervision and in an end-to-end trainable way. We formulate GSGN as a variational autoencoder in which the latent representation is a tree-structured probabilistic scene graph. The leaf nodes in the latent tree correspond to primitive parts, and the edges represent the symbolic pose variables required for recursively composing the parts into whole objects and then the full scene. This allows novel objects and scenes to be generated both by sampling from the prior and by manual configuration of the pose variables, as we do with graphics engines. We evaluate GSGN on datasets of scenes containing multiple compositional objects, including a challenging Compositional CLEVR dataset that we have developed. 
We show that GSGN is able to infer the latent scene graph, generalize out of the training regime, and improve data efficiency in downstream tasks.", + "title": "Generative Scene Graph Networks", + "authors": [ + "Fei Deng", + "Zhuo Zhi", + "Donghun Lee", + "Sungjin Ahn" + ], + "emails": [ + "etri.re.kr", + "~Fei_Deng1", + "~Zhuo_Zhi1", + "~Sungjin_Ahn1" + ], + "rank": 1244 + }, + { + "url": "https://openreview.net/forum?id=19drPzGV691", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 7 + ], + "rating": "5.57", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "We address the problem of learning a risk-sensitive policy based on the CVaR risk measure using distributional reinforcement learning. In particular, we show that applying the distributional Bellman optimality operator with respect to a risk-based action-selection strategy overestimates the dynamic, Markovian CVaR. The resulting policies can however still be overly conservative and one often prefers to learn an optimal policy based on the static, non-Markovian CVaR. To this end, we propose a modification to the existing algorithm and show that it can indeed learn a proper CVaR-optimized policy. Our proposed approach is a simple extension of standard distributional RL algorithms and can therefore take advantage of many of the recent advances in deep RL. 
On both synthetic and real data, we empirically show that our proposed algorithm is able to produce a family of risk-averse policies that achieves a better tradeoff between risk and the expected return.\n", + "title": "Distributional Reinforcement Learning for Risk-Sensitive Policies", + "authors": [ + "Shiau Hong Lim", + "Ilyas Malik" + ], + "emails": [ + "gmail.com", + "~Shiau_Hong_Lim1" + ], + "rank": 1245 + }, + { + "url": "https://openreview.net/forum?id=Oz_4sa7hKhl", + "decision": "Reject", + "ratings": [ + 3, + 8, + 6, + 6 + ], + "rating": "5.57", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "In data-constrained cases, the common practice of fine-tuning BERT for a target text classification task is prone to producing poor performance. In such low resources scenarios, we suggest performing an unsupervised classification task prior to fine-tuning on the target task. \nSpecifically, as such an intermediate task, we perform unsupervised clustering, training BERT on predicting the cluster labels. We test this hypothesis on various data sets, and show that this additional classification step can reduce the demand for labeled examples.\nWe further discuss under which conditions this task is helpful and why. ", + "title": "Cluster & Tune: Enhance BERT Performance in Low Resource Text Classification", + "authors": [ + "Eyal Shnarch", + "Ariel Gera", + "Alon Halfon", + "Lena Dankin", + "Leshem Choshen", + "Ranit Aharonov", + "Noam Slonim" + ], + "emails": [ + "il.ibm.com", + "~Ranit_Aharonov2", + "~Eyal_Shnarch1", + "~Leshem_Choshen1" + ], + "rank": 1246 + }, + { + "url": "https://openreview.net/forum?id=GjqcL-v0J2A", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 6 + ], + "rating": "5.57", + "confidences": [ + 3, + 3, + 3, + 5 + ], + "abstract": "Jointly identifying a mixture of discrete and continuous factors of variability can help unravel complex phenomena. 
We study this problem by proposing an unsupervised framework called coupled mixture VAE (cpl-mixVAE), which utilizes multiple interacting autoencoding agents. The individual agents operate on augmented copies of training samples to learn mixture representations, while being encouraged to reach consensus on the categorical assignments. We provide theoretical justification to motivate the use of a multi-agent framework, and formulate it as a variational inference problem. We benchmark our approach on MNIST and dSprites, achieving state-of-the-art categorical assignments while preserving interpretability of the continuous factors. We then demonstrate the utility of this approach in jointly identifying cell types and type-specific, activity-regulated genes for a single-cell gene expression dataset profiling over 100 cortical neuron types.", + "title": "Mixture Representation Learning with Coupled Autoencoding Agents", + "authors": [ + "Yeganeh Marghi", + "Rohan Gala", + "Uygar S\u00fcmb\u00fcl" + ], + "emails": [ + "~Uygar_S\u00fcmb\u00fcl2", + "~Yeganeh_Marghi1", + "~Rohan_Gala1" + ], + "rank": 1247 + }, + { + "url": "https://openreview.net/forum?id=Iw4ZGwenbXf", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 6, + 4 + ], + "rating": "5.56", + "confidences": [ + 2, + 2, + 3, + 2 + ], + "abstract": "In this work we propose the use of adaptive stochastic search as a building block for general, non-convex optimization operations within deep neural network architectures. Specifically, for an objective function located at some layer in the network and parameterized by some network parameters, we employ adaptive stochastic search to perform optimization over its output. This operation is differentiable and does not obstruct the passing of gradients during backpropagation, thus enabling us to incorporate it as a component in end-to-end learning. 
We study the proposed optimization module's properties and benchmark it against two existing alternatives on a synthetic energy-based structured prediction task, and further showcase its use in stochastic optimal control applications.", + "title": "NOVAS: Non-convex Optimization via Adaptive Stochastic Search for End-to-end Learning and Control", + "authors": [ + "Ioannis Exarchos", + "Marcus Aloysius Pereira", + "Ziyi Wang", + "Evangelos Theodorou" + ], + "emails": [ + "~Ziyi_Wang1", + "~Marcus_Aloysius_Pereira1", + "~Ioannis_Exarchos1", + "~Evangelos_Theodorou1" + ], + "rank": 1248 + }, + { + "url": "https://openreview.net/forum?id=1sJWR4y1lG", + "decision": "Reject", + "ratings": [ + 4, + 8, + 6, + 6 + ], + "rating": "5.56", + "confidences": [ + 4, + 2, + 2, + 1 + ], + "abstract": "Recent works have connected deep learning and kernel methods. In this paper, we show that architectural choices such as convolutional layers with pooling, skip connections, make deep learning a composite kernel learning method, where the kernel is a (architecture dependent) composition of base kernels: even before training, standard deep networks have in-built structural properties that ensure their success. In particular, we build on the recently developed `neural path' framework that characterises the role of gates/masks in fully connected deep networks with ReLU activations. ", + "title": "Deep Learning Is Composite Kernel Learning", + "authors": [ + "CHANDRA SHEKAR LAKSHMINARAYANAN", + "Amit Vikram Singh" + ], + "emails": [ + "~Amit_Vikram_Singh1", + "~CHANDRA_SHEKAR_LAKSHMINARAYANAN1" + ], + "rank": 1249 + }, + { + "url": "https://openreview.net/forum?id=k9EHBqXDEOX", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5 + ], + "rating": "5.56", + "confidences": [ + 2, + 3, + 4 + ], + "abstract": "Asynchronous and parallel implementation of standard reinforcement learning (RL) algorithms is a key enabler of the tremendous success of modern RL. 
\nAmong many asynchronous RL algorithms, arguably the most popular and effective one is the asynchronous advantage actor-critic (A3C) algorithm. \nAlthough A3C is becoming the workhorse of RL, its theoretical properties are still not well-understood, including the non-asymptotic analysis and the performance gain of parallelism (a.k.a. speedup). \nThis paper revisits the A3C algorithm with TD(0) for the critic update, termed A3C-TD(0), with provable convergence guarantees. \nWith linear value function approximation for the TD update, the convergence of A3C-TD(0) is established under both i.i.d. and Markovian sampling. Under i.i.d. sampling, A3C-TD(0) obtains sample complexity of O(\u03f5\u22122.5/N) per worker to achieve \u03f5 accuracy, where N is the number of workers. Compared to the best-known sample complexity of O(\u03f5\u22122.5) for two-timescale AC, A3C-TD(0) achieves \\emph{linear speedup}, which justifies the advantage of parallelism and asynchrony in AC algorithms theoretically for the first time.\nNumerical tests on synthetically generated instances and OpenAI Gym environments have been provided to verify our theoretical analysis. ", + "title": "Asynchronous Advantage Actor Critic: Non-asymptotic Analysis and Linear Speedup", + "authors": [ + "Han Shen", + "Kaiqing Zhang", + "Mingyi Hong", + "Tianyi Chen" + ], + "emails": [ + "rpi.edu", + "~Kaiqing_Zhang3", + "~Mingyi_Hong1", + "~Tianyi_Chen1" + ], + "rank": 1250 + }, + { + "url": "https://openreview.net/forum?id=9w03rTs7w5", + "decision": "Reject", + "ratings": [ + 6, + 6, + 4, + 6, + 6 + ], + "rating": "5.56", + "confidences": [ + 3, + 5, + 4, + 3, + 3 + ], + "abstract": "Transfer Learning has shown great potential to enhance the single-agent Reinforcement Learning (RL) efficiency, by sharing learned policies of previous tasks. Similarly, in multiagent settings, the learning performance can also be promoted if agents can share knowledge between each other. 
However, it remains an open question of how an agent should learn from other agents' knowledge. In this paper, we propose a novel multiagent option-based policy transfer (MAOPT) framework to improve multiagent learning efficiency. Our framework learns what advice to give to each agent and when to terminate it by modeling multiagent policy transfer as the option learning problem. MAOPT provides different kinds of variants which can be classified into two types in terms of the experience used during training. One type is the MAOPT with the Global Option Advisor which has the access to the global information of the environment. However, in many realistic scenarios, we can only obtain each agent's local information due to the partial observation. The other type contains MAOPT with the Local Option Advisor and MAOPT with the Successor Representation Option (SRO) which are suitable for this setting and collect each agent's local experience for the update. In many cases, each agent's experience is inconsistent with each other which causes the option-value estimation to oscillate and to become inaccurate. SRO is used to handle the experience inconsistency by decoupling the dynamics of the environment from the rewards to learn the option-value function under each agent's preference. MAOPT can be easily combined with existing deep RL approaches. 
Experimental results show it significantly boosts the performance of existing deep RL methods in both discrete and continuous state spaces.", + "title": "Transfer among Agents: An Efficient Multiagent Transfer Learning Framework", + "authors": [ + "Tianpei Yang", + "Jianye HAO", + "Weixun Wang", + "Hongyao Tang", + "Zhaopeng Meng", + "Hangyu Mao", + "Dong Li", + "Wulong Liu", + "Yujing Hu", + "Yingfeng Chen", + "Changjie Fan" + ], + "emails": [ + "~Tianpei_Yang1", + "~Yingfeng_Chen1", + "~Hongyao_Tang1", + "~Weixun_Wang1", + "~Wulong_Liu1", + "~Jianye_HAO1", + "corp.netease.com", + "~Zhaopeng_Meng1", + "huawei.com", + "~Changjie_Fan1" + ], + "rank": 1251 + }, + { + "url": "https://openreview.net/forum?id=qbRv1k2AcH", + "decision": "Reject", + "ratings": [ + 4, + 6, + 6, + 6, + 6 + ], + "rating": "5.56", + "confidences": [ + 4, + 3, + 4, + 4, + 3 + ], + "abstract": "In this paper, we demonstrate how to do automated higher-order logic theorem proving in the presence of a large knowledge base of potential premises without learning from human proofs. We augment the exploration of premises based on a simple tf-idf (term frequency-inverse document frequency) based lookup in a deep reinforcement learning scenario. Our experiments show that our theorem prover trained with this exploration mechanism but no human proofs, dubbed DeepHOL Zero, outperforms provers that are trained only on human proofs. It approaches the performance of a prover trained by a combination of imitation and reinforcement learning. We perform multiple experiments to understand the importance of the underlying assumptions that make our exploration approach work, thus explaining our design choices.", + "title": "Learning to Reason in Large Theories without Imitation", + "authors": [ + "Kshitij Bansal", + "Christian Szegedy", + "Markus Norman Rabe", + "Sarah M. 
Loos", + "Viktor Toman" + ], + "emails": [ + "~Markus_Norman_Rabe1", + "~Kshitij_Bansal1", + "~Christian_Szegedy1", + "gmail.com", + "ist.ac.at" + ], + "rank": 1252 + }, + { + "url": "https://openreview.net/forum?id=hpH98mK5Puk", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 8, + 6 + ], + "rating": "5.56", + "confidences": [ + 4, + 2, + 3 + ], + "abstract": "Large-scale language models such as BERT have achieved state-of-the-art performance across a wide range of NLP tasks. Recent studies, however, show that such BERT-based models are vulnerable facing the threats of textual adversarial attacks. We aim to address this problem from an information-theoretic perspective, and propose InfoBERT, a novel learning framework for robust \ufb01ne-tuning of pre-trained language models. InfoBERT contains two mutual-information-based regularizers for model training: (i) an Information Bottleneck regularizer, which suppresses noisy mutual information between the input and the feature representation; and (ii) a Robust Feature regularizer, which increases the mutual information between local robust features and global features. We provide a principled way to theoretically analyze and improve the robustness of representation learning for language models in both standard and adversarial training. 
Extensive experiments demonstrate that InfoBERT achieves state-of-the-art robust accuracy over several adversarial datasets on Natural Language Inference (NLI) and Question Answering (QA) tasks.\nOur code is available at https://github.com/AI-secure/InfoBERT.", + "title": "InfoBERT: Improving Robustness of Language Models from An Information Theoretic Perspective", + "authors": [ + "Boxin Wang", + "Shuohang Wang", + "Yu Cheng", + "Zhe Gan", + "Ruoxi Jia", + "Bo Li", + "Jingjing Liu" + ], + "emails": [ + "~Jingjing_Liu2", + "vt.edu", + "~Boxin_Wang1", + "~Shuohang_Wang1", + "~Yu_Cheng1", + "~Zhe_Gan1", + "~Bo_Li19" + ], + "rank": 1253 + }, + { + "url": "https://openreview.net/forum?id=ZS-9XoX20AV", + "decision": "Reject", + "ratings": [ + 4, + 8, + 6, + 3 + ], + "rating": "5.56", + "confidences": [ + 4, + 5, + 4, + 3 + ], + "abstract": "Graph Neural Networks (GNNs) learn effective node/graph representations by aggregating the attributes of neighboring nodes, which commonly derives a single representation mixing the information of graph structure and node attributes. However, these two kinds of information might be semantically inconsistent and could be useful for different tasks. In this paper, we aim at learning node/graph representations with Structure-Attribute Disentanglement (GraphSAD). We propose to disentangle graph structure and node attributes into two distinct sets of representations, and such disentanglement can be done in either the input or the embedding space. We further design a metric to quantify the extent of such a disentanglement. 
Extensive experiments on multiple datasets show that our approach can indeed disentangle the semantics of graph structure and node attributes, and it achieves superior performance on both node and graph classification tasks.", + "title": "GraphSAD: Learning Graph Representations with Structure-Attribute Disentanglement", + "authors": [ + "Minghao Xu", + "Hang Wang", + "Bingbing Ni", + "Wenjun Zhang", + "Jian Tang" + ], + "emails": [ + "~Hang_Wang1", + "~Wenjun_Zhang3", + "~Minghao_Xu1", + "~Jian_Tang1", + "~Bingbing_Ni3" + ], + "rank": 1254 + }, + { + "url": "https://openreview.net/forum?id=rYt0p0Um9r", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 5 + ], + "rating": "5.56", + "confidences": [ + 4, + 5, + 3, + 4 + ], + "abstract": " Over-parameterization is a recent topic of much interest in the machine learning community. While over-parameterized neural networks are capable of perfectly fitting (interpolating) training data, these networks often perform well on test data, thereby contradicting classical learning theory. Recent work provided an explanation for this phenomenon by introducing the double descent curve, showing that increasing model capacity past the interpolation threshold leads to a decrease in test error. In line with this, it was recently shown empirically and theoretically that increasing neural network capacity through width leads to double descent. In this work, we analyze the effect of increasing depth on test performance. In contrast to what is observed for increasing width, we demonstrate through a variety of classification experiments on CIFAR10 and ImageNet-32 using ResNets and fully-convolutional networks that test performance worsens beyond a critical depth. We posit an explanation for this phenomenon by drawing intuition from the principle of minimum norm solutions in linear networks. 
\n", + "title": "Do Deeper Convolutional Networks Perform Better?", + "authors": [ + "Eshaan Nichani", + "Adityanarayanan Radhakrishnan", + "Caroline Uhler" + ], + "emails": [ + "~Eshaan_Nichani1", + "~Adityanarayanan_Radhakrishnan1", + "~Caroline_Uhler1" + ], + "rank": 1255 + }, + { + "url": "https://openreview.net/forum?id=vcKVhY7AZqK", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5 + ], + "rating": "5.56", + "confidences": [ + 5, + 1, + 3 + ], + "abstract": "How can we measure the \u201ccomplexity\u201d of a learning task so that we can compare one task to another? From classical information theory, we know that entropy is a useful measure of the complexity of a random variable and provides a lower bound on the minimum expected number of bits needed for transmitting its state. In this paper, we propose to measure the complexity of a learning task by the minimum expected number of questions that need to be answered to solve the task. For example, the minimum expected number of patches that need to be observed to classify FashionMNIST images. We prove several properties of the proposed complexity measure, including connections with classical entropy and sub-additivity for multiple tasks. As the computation of the minimum expected number of questions is generally intractable, we propose a greedy procedure called \u201cinformation pursuit\u201d (IP), which selects one question at a time depending on previous questions and their answers. This requires learning a probabilistic generative model relating data and questions to the task, for which we employ variational autoencoders and normalizing flows. We illustrate the usefulness of the proposed measure on various binary image classification tasks using image patches as the query set. Our results indicate that the complexity of a classification task increases as signal-to-noise ratio decreases, and that classification of the KMNIST dataset is more complex than classification of the FashionMNIST dataset. 
As a byproduct of choosing patches as queries, our approach also provides a principled way of determining which pixels in an image are most informative for a task.", + "title": "Quantifying Task Complexity Through Generalized Information Measures", + "authors": [ + "Aditya Chattopadhyay", + "Benjamin David Haeffele", + "Donald Geman", + "Rene Vidal" + ], + "emails": [ + "~Aditya_Chattopadhyay1", + "~Rene_Vidal1", + "~Donald_Geman2", + "~Benjamin_David_Haeffele1" + ], + "rank": 1256 + }, + { + "url": "https://openreview.net/forum?id=qiAxL3Xqx1o", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 5, + 7 + ], + "rating": "5.56", + "confidences": [ + 3, + 4, + 1, + 4, + 4 + ], + "abstract": "We study the fundamental problem of graph generation. Specifically, we treat graph generation from a geometric perspective by associating each node with a position in space and then connecting the edges based on a similarity function. We then provide new solutions to the key challenges that prevent the widespread application of this classical geometric interpretation: (1) modeling complex relations, (2) modeling isomorphic graphs consistently, and (3) fully exploiting the latent distribution.\nOur main contribution is dubbed as the geometric graph (GG) generative adversarial network (GAN), which is a Wasserstein GAN that addresses the above challenges. GG-GAN is permutation equivariant and easily scales to generate graphs of tens of thousands of nodes. 
GG-GAN also strikes a good trade-off between novelty and modeling the distribution statistics, being competitive or surpassing the state-of-the-art methods that are either slower or that are non-equivariant, or that exploit problem-specific knowledge.", + "title": "GG-GAN: A Geometric Graph Generative Adversarial Network", + "authors": [ + "Igor Krawczuk", + "Pedro Abranches", + "Andreas Loukas", + "Volkan Cevher" + ], + "emails": [ + "epfl.ch", + "~Andreas_Loukas1", + "~Volkan_Cevher1", + "~Igor_Krawczuk1" + ], + "rank": 1257 + }, + { + "url": "https://openreview.net/forum?id=lmTWnm3coJJ", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 5, + 6 + ], + "rating": "5.56", + "confidences": [ + 4, + 5, + 4, + 5 + ], + "abstract": "Neural network training can easily overfit noisy labels resulting in poor generalization performance. Existing methods address this problem by (1) filtering out the noisy data and only using the clean data for training or (2) relabeling the noisy data by the model during training or by another model trained only on a clean dataset. However, the former does not leverage the features' information of wrongly-labeled data, while the latter may produce wrong pseudo-labels for some data and introduce extra noises. In this paper, we propose a smooth transition and interplay between these two strategies as a curriculum that selects training samples dynamically. In particular, we start with learning from clean data and then gradually move to learn noisy-labeled data with pseudo labels produced by a time-ensemble of the model and data augmentations. Instead of using the instantaneous loss computed at the current step, our data selection is based on the dynamics of both the loss and output consistency for each sample across historical steps and different data augmentations, resulting in more precise detection of both clean labels and correct pseudo labels. 
On multiple benchmarks of noisy labels, we show that our curriculum learning strategy can significantly improve the test accuracy without any auxiliary model or extra clean data.", + "title": "Robust Curriculum Learning: from clean label detection to noisy label self-correction", + "authors": [ + "Tianyi Zhou", + "Shengjie Wang", + "Jeff Bilmes" + ], + "emails": [ + "~Jeff_Bilmes1", + "~Shengjie_Wang1", + "~Tianyi_Zhou1" + ], + "rank": 1258 + }, + { + "url": "https://openreview.net/forum?id=Aj4_e50nB8", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 6 + ], + "rating": "5.56", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "A computationally expensive and memory intensive neural network lies behind the recent success of language representation learning. Knowledge distillation, a major technique for deploying such a vast language model in resource-scarce environments, transfers the knowledge on individual word representations learned without restrictions. In this paper, inspired by the recent observations that language representations are relatively positioned and have more semantic knowledge as a whole, we present a new knowledge distillation strategy for language representation learning that transfers the contextual knowledge via two types of relationships across representations: Word Relation and Layer Transforming Relation. We validate the effectiveness of our method on challenging benchmarks of language understanding tasks. 
The code will be released.", + "title": "Contextual Knowledge Distillation for Transformer Compression", + "authors": [ + "Geondo Park", + "Gyeongman Kim", + "Eunho Yang" + ], + "emails": [ + "~Eunho_Yang1", + "~Geondo_Park1", + "~Gyeongman_Kim1" + ], + "rank": 1259 + }, + { + "url": "https://openreview.net/forum?id=NGBY716p1VR", + "decision": "Reject", + "ratings": [ + 5, + 5, + 7, + 5 + ], + "rating": "5.56", + "confidences": [ + 4, + 4, + 5, + 5 + ], + "abstract": "Current neural-network-based classifiers are susceptible to adversarial examples. The most empirically successful approach to defending against such adversarial examples is adversarial training, which incorporates a strong self-attack during training to enhance its robustness. This approach, however, is computationally expensive and hence is hard to scale up. A recent work, called fast adversarial training, has shown that it is possible to markedly reduce computation time without sacrificing significant performance. This approach incorporates simple self-attacks, yet it can only run for a limited number of training epochs, resulting in sub-optimal performance. In this paper, we conduct experiments to understand the behavior of fast adversarial training and show the key to its success is the ability to recover from overfitting to weak attacks. 
We then extend our findings to improve fast adversarial training, demonstrating superior robust accuracy to strong adversarial training, with much-reduced training time.", + "title": "Towards Understanding Fast Adversarial Training", + "authors": [ + "Bai Li", + "Shiqi Wang", + "Suman Jana", + "Lawrence Carin" + ], + "emails": [ + "~Lawrence_Carin2", + "~Suman_Jana1", + "~Bai_Li1", + "~Shiqi_Wang2" + ], + "rank": 1260 + }, + { + "url": "https://openreview.net/forum?id=IUaOP8jQfHn", + "decision": "Reject", + "ratings": [ + 7, + 5, + 4, + 5 + ], + "rating": "5.56", + "confidences": [ + 3, + 3, + 1, + 2 + ], + "abstract": "Perceiving the world in terms of objects and tracking them through time is a crucial prerequisite for reasoning and scene understanding. Recently, several methods have been proposed for unsupervised learning of object-centric representations. However, since these models have been evaluated with respect to different downstream tasks, it remains unclear how they compare in terms of basic perceptual abilities such as detection, figure-ground segmentation and tracking of individual objects. To close this gap, we design a benchmark with three datasets of varying complexity and seven additional test sets which feature challenging tracking scenarios relevant for natural videos. Using this benchmark, we compare the perceptual abilities of four unsupervised object-centric learning approaches: ViMON, a video-extension of MONet, based on a recurrent spatial attention mechanism, OP3, which exploits clustering via spatial mixture models, as well as TBA and SCALOR, which use an explicit factorization via spatial transformers. Our results suggest that architectures with unconstrained latent representations and full-image object masks such as ViMON and OP3 are able to learn more powerful representations in terms of object detection, segmentation and tracking than the explicitly parameterized spatial transformer based architecture of TBA and SCALOR. 
We also observe that none of the methods are able to gracefully handle the most challenging tracking scenarios despite their synthetic nature, suggesting that our benchmark may provide fruitful guidance towards learning more robust object-centric video representations.", + "title": "Benchmarking Unsupervised Object Representations for Video Sequences", + "authors": [ + "Marissa A. Weis", + "Kashyap Chitta", + "Yash Sharma", + "Wieland Brendel", + "Matthias Bethge", + "Andreas Geiger", + "Alexander S Ecker" + ], + "emails": [ + "~Wieland_Brendel1", + "~Marissa_A._Weis1", + "~Andreas_Geiger3", + "~Kashyap_Chitta1", + "~Alexander_S_Ecker1", + "~Yash_Sharma1", + "~Matthias_Bethge2" + ], + "rank": 1261 + }, + { + "url": "https://openreview.net/forum?id=MbM_gvIB3Y4", + "decision": "Reject", + "ratings": [ + 6, + 7, + 5, + 5, + 5 + ], + "rating": "5.56", + "confidences": [ + 3, + 3, + 3, + 3, + 4 + ], + "abstract": "Mutual information maximization provides an appealing formalism for learning representations of data. In the context of reinforcement learning, such representations can accelerate learning by discarding irrelevant and redundant information, while retaining the information necessary for control. Much of the prior work on these methods has addressed the practical difficulties of estimating mutual information from samples of high-dimensional observations, while comparatively less is understood about \\emph{which} mutual information objectives are sufficient for RL from a theoretical perspective. In this paper we identify conditions under which representations that maximize specific mutual-information objectives are theoretically sufficient for learning and representing the optimal policy. Somewhat surprisingly, we find that several popular objectives can yield insufficient representations given mild and common assumptions on the structure of the MDP. 
We corroborate our theoretical results with deep RL experiments on a simulated game environment with visual observations.", + "title": "Which Mutual-Information Representation Learning Objectives are Sufficient for Control?", + "authors": [ + "Kate Rakelly", + "Abhishek Gupta", + "Carlos Florensa", + "Sergey Levine" + ], + "emails": [ + "~Sergey_Levine1", + "~Kate_Rakelly1", + "~Abhishek_Gupta1", + "~Carlos_Florensa1" + ], + "rank": 1262 + }, + { + "url": "https://openreview.net/forum?id=_kxlwvhOodK", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 5, + 6 + ], + "rating": "5.56", + "confidences": [ + 3, + 4, + 2 + ], + "abstract": "Growing applications of generative models have led to new threats such as malicious personation and digital copyright infringement. \nOne solution to these threats is model attribution, i.e., the identification of user-end models where the contents under question are generated.\nExisting studies showed empirical feasibility of attribution through a centralized classifier trained on all existing user-end models. \nHowever, this approach is not scalable in a reality where the number of models ever grows. Neither does it provide an attributability guarantee.\nTo this end, this paper studies decentralized attribution, which relies on binary classifiers associated with each user-end model. \nEach binary classifier is parameterized by a user-specific key and distinguishes its associated model distribution from the authentic data distribution. \nWe develop sufficient conditions of the keys that guarantee an attributability lower bound.\nOur method is validated on MNIST, CelebA, and FFHQ datasets. 
We also examine the trade-off between generation quality and robustness of attribution against adversarial post-processes.", + "title": " Decentralized Attribution of Generative Models", + "authors": [ + "Changhoon Kim", + "Yi Ren", + "Yezhou Yang" + ], + "emails": [ + "~Yi_Ren3", + "~Changhoon_Kim1", + "~Yezhou_Yang1" + ], + "rank": 1263 + }, + { + "url": "https://openreview.net/forum?id=vrCiOrqgl3B", + "decision": "Reject", + "ratings": [ + 4, + 6, + 5, + 7 + ], + "rating": "5.56", + "confidences": [ + 4, + 4, + 5, + 5 + ], + "abstract": "Optimal transport (OT) provides a way of measuring distances between distributions that depends on the geometry of the sample space. In light of recent advances in solving the OT problem, OT distances are widely used as loss functions in minimum distance estimation. Despite its prevalence and advantages, however, OT is extremely sensitive to outliers. A single adversarially-picked outlier can increase OT distance arbitrarily. To address this issue, in this work we propose an outlier-robust OT formulation. Our formulation is convex but challenging to scale at a first glance. We proceed by deriving an equivalent formulation based on cost truncation that is easy to incorporate into modern stochastic algorithms for regularized OT. 
We demonstrate our model applied to mean estimation under the Huber contamination model in simulation as well as outlier detection on real data.", + "title": "Outlier Robust Optimal Transport", + "authors": [ + "Debarghya Mukherjee", + "Aritra Guha", + "Justin Solomon", + "Yuekai Sun", + "Mikhail Yurochkin" + ], + "emails": [ + "~Justin_Solomon1", + "~Mikhail_Yurochkin1", + "~Aritra_Guha1", + "~Yuekai_Sun1", + "~Debarghya_Mukherjee1" + ], + "rank": 1264 + }, + { + "url": "https://openreview.net/forum?id=oBmpWzJTCa4", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 6 + ], + "rating": "5.55", + "confidences": [ + 2, + 3, + 3, + 3 + ], + "abstract": "Learning to control a safety-critical system with latent dynamics (e.g. for deep brain stimulation) requires judiciously taking calculated risks to gain information. We present a probabilistically-safe, meta-active learning approach to efficiently learn system dynamics and optimal configurations. The key to our approach is a novel integration of meta-learning and chance-constrained optimization in which we 1) meta-learn an LSTM-based embedding of the active learning sample history, 2) encode a deep learning-based acquisition function with this embedding into a mixed-integer linear program (MILP), and 3) solve the MILP to find the optimal action trajectory, trading off the predicted information gain from the acquisition function and the likelihood of safe control. We set a new state-of-the-art in active learning to control a high-dimensional system with latent dynamics, achieving a 46% increase in information gain and a 20% speedup in computation time. 
We then outperform baseline methods in learning the optimal parameter settings for deep brain stimulation in rats to enhance the rats\u2019 performance on a cognitive task while safely avoiding unwanted side effects (i.e., triggering seizures).", + "title": "Meta-Active Learning in Probabilistically-Safe Optimization", + "authors": [ + "Mariah L Schrum", + "Mark Connolly", + "Eric Cole", + "Mihir Ghetiya", + "Robert Gross", + "Matthew C. Gombolay" + ], + "emails": [ + "~Mariah_L_Schrum1", + "emory.edu", + "~Robert_Gross1", + "~Matthew_C._Gombolay1" + ], + "rank": 1265 + }, + { + "url": "https://openreview.net/forum?id=VCAXR34cp59", + "decision": "Reject", + "ratings": [ + 7, + 6, + 4 + ], + "rating": "5.55", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "Constructing disentangled representations is known to be a difficult task, especially in the unsupervised scenario. The dominating paradigm of unsupervised disentanglement is currently to train a generative model that separates different factors of variation in its latent space. This separation is typically enforced by training with specific regularization terms in the model's objective function. These terms, however, introduce additional hyperparameters responsible for the trade-off between disentanglement and generation quality. While tuning these hyperparameters is crucial for proper disentanglement, it is often unclear how to tune them without external supervision.\n\nThis paper investigates an alternative route to disentangled representations. Namely, we propose to extract such representations from the state-of-the-art GANs trained without disentangling terms in their objectives. 
This paradigm of post hoc disentanglement employs little or no hyperparameters when learning representations, while achieving results on par with existing state-of-the-art, as shown by comparison in terms of established disentanglement metrics, fairness, and the abstract reasoning task.\nAll our code and models are publicly available.", + "title": "On Disentangled Representations Extracted from Pretrained GANs", + "authors": [ + "Valentin Khrulkov", + "Leyla Mirvakhabova", + "Ivan Oseledets", + "Artem Babenko" + ], + "emails": [ + "~Leyla_Mirvakhabova1", + "~Artem_Babenko1", + "~Ivan_Oseledets1", + "~Valentin_Khrulkov1" + ], + "rank": 1266 + }, + { + "url": "https://openreview.net/forum?id=w8iCTOJvyD", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5, + 7 + ], + "rating": "5.55", + "confidences": [ + 3, + 3, + 2, + 3 + ], + "abstract": "From CNNs to attention mechanisms, encoding inductive biases into neural networks has been a fruitful source of improvement in machine learning. Auxiliary losses are a general way of encoding biases in order to help networks learn better representations by adding extra terms to the loss function. However, since they are minimized on the training data, they suffer from the same generalization gap as regular task losses. Moreover, by changing the loss function, the network is optimizing a different objective than the one we care about. In this work we solve both problems: first, we take inspiration from transductive learning and note that, after receiving an input but before making a prediction, we can fine-tune our models on any unsupervised objective. We call this process tailoring, because we customize the model to each input. Second, we formulate a nested optimization (similar to those in meta-learning) and train our models to perform well on the task loss after adapting to the tailoring loss. 
The advantages of tailoring and meta-tailoring are discussed theoretically and demonstrated empirically on several diverse examples: encoding inductive conservation laws from physics to improve predictions, improving local smoothness to increase robustness to adversarial examples, and using contrastive losses on the query image to improve generalization.", + "title": "Tailoring: encoding inductive biases by optimizing unsupervised objectives at prediction time", + "authors": [ + "Ferran Alet", + "Kenji Kawaguchi", + "Maria Bauza Villalonga", + "Nurullah Giray Kuru", + "Tomas Perez", + "Leslie Pack Kaelbling" + ], + "emails": [ + "~Maria_Bauza_Villalonga1", + "~Leslie_Pack_Kaelbling1", + "~Ferran_Alet1", + "~Tomas_Perez1", + "mit.edu", + "~Kenji_Kawaguchi1" + ], + "rank": 1267 + }, + { + "url": "https://openreview.net/forum?id=xCm8kiWRiBT", + "decision": "Significant concerns (Do not publish)", + "ratings": [ + 7, + 5, + 5, + 5 + ], + "rating": "5.55", + "confidences": [ + 3, + 2, + 3, + 3 + ], + "abstract": "We initiate the study of adversarial attacks on models for binary (i.e. black and white) image classification. Although there has been a great deal of work on attacking models for colored and grayscale images, little is known about attacks on models for binary images. Models trained to classify binary images are used in text recognition applications such as check processing, license plate recognition, invoice processing, and many others. In contrast to colored and grayscale images, the search space of attacks on binary images is extremely restricted and noise cannot be hidden with minor perturbations in each pixel. Thus, the optimization landscape of attacks on binary images introduces new fundamental challenges.\n\nIn this paper we introduce a new attack algorithm called Scar, designed to fool classifiers of binary images. 
We show that Scar significantly outperforms existing L0 attacks applied to the binary setting and use it to demonstrate the vulnerability of real-world text recognition systems. Scar\u2019s strong performance in practice contrasts with hardness results that show the existence of worst-case classifiers for binary images that are robust to large perturbations. In many cases, altering a single pixel is sufficient to trick Tesseract, a popular open-source text recognition system, to misclassify a word as a different word in the English dictionary. We also demonstrate the vulnerability of check recognition by fooling commercial check processing systems used by major US banks for mobile deposits. These systems are substantially harder to fool since they classify both the handwritten amounts in digits and letters, independently. Nevertheless, we generalize Scar to design attacks that fool state-of-the-art check processing systems using unnoticeable perturbations that lead to misclassification of deposit amounts. Consequently, this is a powerful method to perform financial fraud.", + "title": "Adversarial Attacks on Binary Image Recognition Systems", + "authors": [ + "Eric Balkanski", + "Harrison Chase", + "Kojin Oshiba", + "Alexander Rilee", + "Yaron Singer", + "Richard Wang" + ], + "emails": [ + "robustintelligence.com", + "~Eric_Balkanski2" + ], + "rank": 1268 + }, + { + "url": "https://openreview.net/forum?id=tEw4vEEhHjI", + "decision": "Reject", + "ratings": [ + 7, + 5, + 5 + ], + "rating": "5.55", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "Approximate Bayesian methods can mitigate overconfidence in ReLU networks. However, far away from the training data, even Bayesian neural networks (BNNs) can still underestimate uncertainty and thus be overconfident. We suggest to fix this by considering an infinite number of ReLU features over the input domain that are never part of the training process and thus remain at prior values. 
Perhaps surprisingly, we show that this model leads to a tractable Gaussian process (GP) term that can be added to a pre-trained BNN's posterior at test time with negligible cost overhead. The BNN then yields structured uncertainty in the proximity of training data, while the GP prior calibrates uncertainty far away from them. As a key contribution, we prove that the added uncertainty yields cubic predictive variance growth, and thus the ideal uniform (maximum entropy) confidence in multi-class classification far from the training data.", + "title": "Fixing Asymptotic Uncertainty of Bayesian Neural Networks with Infinite ReLU Features", + "authors": [ + "Agustinus Kristiadi", + "Matthias Hein", + "Philipp Hennig" + ], + "emails": [ + "~Philipp_Hennig1", + "~Agustinus_Kristiadi1", + "~Matthias_Hein2" + ], + "rank": 1269 + }, + { + "url": "https://openreview.net/forum?id=7YctWnyhjpL", + "decision": "Reject", + "ratings": [ + 7, + 5, + 5 + ], + "rating": "5.55", + "confidences": [ + 3, + 3, + 5 + ], + "abstract": "As the range of tasks performed by a general vision system expands, executing multiple tasks accurately and ef\ufb01ciently in a single network has become an important and still open problem. Recent computer vision approaches address this problem by branching networks, or by a channel-wise modulation of the network feature-maps with task speci\ufb01c vectors. We present a novel architecture that uses a dedicated top-down control network to modify the activation of all the units in the main recognition network in a manner that depends on the selected task, image content, and spatial location. We show the effectiveness of our scheme by achieving signi\ufb01cantly better results than alternative state-of-the-art approaches on four datasets. We further demonstrate our advantages in terms of task selectivity, scaling the number of tasks and interpretability. 
\nCode is supplied in the supplementary materials and will be publicly available.", + "title": "Multi-Task Learning by a Top-Down Control Network", + "authors": [ + "Hila Levi", + "Shimon Ullman" + ], + "emails": [ + "~Hila_Levi1", + "~Shimon_Ullman1" + ], + "rank": 1270 + }, + { + "url": "https://openreview.net/forum?id=kdm4Lm9rgB", + "decision": "Reject", + "ratings": [ + 4, + 5, + 6, + 7 + ], + "rating": "5.54", + "confidences": [ + 3, + 4, + 2, + 4 + ], + "abstract": "State-of-the-art deep reinforcement learning (DRL) algorithms tend to overfit in some specific environments due to the lack of data diversity in training. To mitigate the model discrepancy between training and target (testing) environments, domain randomization (DR) can generate plenty of environments with a sufficient diversity by randomly sampling environment parameters in simulator. Though standard DR using a uniform distribution improves the average performance on the whole range of environments, the worst-case environment is usually neglected without any performance guarantee. Since the average and worst-case performance are equally important for the generalization in RL, in this paper, we propose a policy optimization approach for concurrently improving the policy's performance in the average case (i.e., over all possible environments) and the worst-case environment. We theoretically derive a lower bound for the worst-case performance of a given policy over all environments. Guided by this lower bound, we formulate an optimization problem which aims to optimize the policy and sampling distribution together, such that the constrained expected performance of all environments is maximized. We prove that the worst-case performance is monotonically improved by iteratively solving this optimization problem. Based on the proposed lower bound, we develop a practical algorithm, named monotonic robust policy optimization (MRPO), and validate MRPO on several robot control tasks. 
By modifying the environment parameters in simulation, we obtain environments for the same task but with different transition dynamics for training and testing. We demonstrate that MRPO can improve both the average and worst-case performance in the training environments, and facilitate the learned policy with a better generalization capability in unseen testing environments.", + "title": "Monotonic Robust Policy Optimization with Model Discrepancy", + "authors": [ + "Yuankun Jiang", + "Chenglin Li", + "Junni Zou", + "Wenrui Dai", + "Hongkai Xiong" + ], + "emails": [ + "~Chenglin_Li2", + "~Hongkai_Xiong1", + "~Wenrui_Dai1", + "~Junni_Zou1", + "~Yuankun_Jiang1" + ], + "rank": 1271 + }, + { + "url": "https://openreview.net/forum?id=4P35MfnBQIY", + "decision": "Reject", + "ratings": [ + 5, + 6, + 7, + 4 + ], + "rating": "5.54", + "confidences": [ + 4, + 2, + 4, + 3 + ], + "abstract": "Knowledge Tracing (KT), tracking a human's knowledge acquisition, is a central component in online learning and AI in Education. In this paper, we present a simple, yet effective strategy to improve the generalization ability of KT models: we propose three types of novel data augmentation, coined replacement, insertion, and deletion, along with corresponding regularization losses that impose certain consistency or monotonicity bias on model's predictions for the original and augmented sequence. Extensive experiments on various KT benchmarks show that our regularization scheme significantly improve the prediction performances, under 3 widely-used neural networks and 4 public benchmarks for KT, e.g., it yields 6.3% improvement in AUC under the DKT model and the ASSISTmentsChall dataset. 
", + "title": "Consistency and Monotonicity Regularization for Neural Knowledge Tracing", + "authors": [ + "Seewoo Lee", + "Youngduck Choi", + "Juneyoung Park", + "Byungsoo Kim", + "Jinwoo Shin" + ], + "emails": [ + "~Seewoo_Lee1", + "~Youngduck_Choi2", + "~Byungsoo_Kim1", + "~Juneyoung_Park1", + "~Jinwoo_Shin1" + ], + "rank": 1272 + }, + { + "url": "https://openreview.net/forum?id=m4PC1eUknQG", + "decision": "Reject", + "ratings": [ + 6, + 4, + 6, + 6 + ], + "rating": "5.54", + "confidences": [ + 4, + 3, + 3, + 3 + ], + "abstract": "Opponent modeling is essential to exploit sub-optimal opponents in strategic interactions. One key challenge facing opponent modeling is how to fast adapt to opponents with diverse styles of strategies. Most previous works focus on building explicit models to predict the opponents\u2019 styles or strategies directly. However, these methods require a large amount of data to train the model and lack the adaptability to new opponents of unknown styles. In this work, we propose a novel Learning to Exploit (L2E) framework for implicit opponent modeling. L2E acquires the ability to exploit opponents by a few interactions with different opponents during training so that it can adapt to new opponents with unknown styles during testing quickly. We propose a novel Opponent Strategy Generation (OSG) algorithm that produces effective opponents for training automatically. By learning to exploit the challenging opponents generated by OSG through adversarial training, L2E gradually eliminates its own strategy\u2019s weaknesses. Moreover, the generalization ability of L2E is significantly improved by training with diverse opponents, which are produced by OSG through diversity-regularized policy optimization. We evaluate the L2E framework on two poker games and one grid soccer game, which are the commonly used benchmark for opponent modeling. 
Comprehensive experimental results indicate that L2E quickly adapts to diverse styles of unknown opponents.", + "title": "L2E: Learning to Exploit Your Opponent", + "authors": [ + "Zhe Wu", + "Kai Li", + "Hang Xu", + "Meng Zhang", + "Haobo Fu", + "Bo An", + "Junliang Xing" + ], + "emails": [ + "~Kai_Li2", + "ia.ac.cn", + "~Junliang_Xing1", + "~Bo_An2", + "~Haobo_Fu2" + ], + "rank": 1273 + }, + { + "url": "https://openreview.net/forum?id=jGeOQt3oUl1", + "decision": "Reject", + "ratings": [ + 3, + 7, + 7, + 6 + ], + "rating": "5.54", + "confidences": [ + 4, + 3, + 3, + 3 + ], + "abstract": "Normalizing flows are among the most popular paradigms in generative modeling, especially for images, primarily because we can efficiently evaluate the likelihood of a data point. This is desirable both for evaluating the fit of a model, and for ease of training, as maximizing the likelihood can be done by gradient descent. However, training normalizing flows comes with difficulties as well: models which produce good samples typically need to be extremely deep -- which comes with accompanying vanishing/exploding gradient problems. A very related problem is that they are often poorly \\emph{conditioned}: since they are parametrized as invertible maps from Rd\u2192Rd, and typical training data like images intuitively is lower-dimensional, the learned maps often have Jacobians that are close to being singular. \n\nIn our paper, we tackle representational aspects around depth and conditioning of normalizing flows---both for general invertible architectures, and for a particular common architecture---affine couplings. \n\nFor general invertible architectures, we prove that invertibility comes at a cost in terms of depth: we show examples where a much deeper normalizing flow model may need to be used to match the performance of a non-invertible generator. 
\n\nFor affine couplings, we first show that the choice of partitions isn't a likely bottleneck for depth: we show that any invertible linear map (and hence a permutation) can be simulated by a constant number of affine coupling layers, using a fixed partition. This shows that the extra flexibility conferred by 1x1 convolution layers, as in GLOW, can in principle be simulated by increasing the size by a constant factor. Next, in terms of conditioning, we show that affine couplings are universal approximators -- provided the Jacobian of the model is allowed to be close to singular. We furthermore empirically explore the benefit of different kinds of padding -- a common strategy for improving conditioning.", + "title": "Representational aspects of depth and conditioning in normalizing flows", + "authors": [ + "Frederic Koehler", + "Viraj Mehta", + "Andrej Risteski" + ], + "emails": [ + "~Frederic_Koehler1", + "~Andrej_Risteski2", + "~Viraj_Mehta1" + ], + "rank": 1274 + }, + { + "url": "https://openreview.net/forum?id=BntruCi1uvF", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 5 + ], + "rating": "5.54", + "confidences": [ + 3, + 3, + 4, + 3 + ], + "abstract": "In this paper, we present a policy gradient method that avoids exploratory noise injection and performs policy search over the deterministic landscape. By avoiding noise injection all sources of estimation variance can be eliminated in systems with deterministic dynamics (up to the initial state distribution). Since deterministic policy regularization is impossible using traditional non-metric measures such as the KL divergence, we derive a Wasserstein-based quadratic model for our purposes. 
We state conditions on the system model under which it is possible to establish a monotonic policy improvement guarantee, propose a surrogate function for policy gradient estimation, and show that it is possible to compute exact advantage estimates if both the state transition model and the policy are deterministic. Finally, we describe two novel robotic control environments---one with non-local rewards in the frequency domain and the other with a long horizon (8000 time-steps)---for which our policy gradient method (TDPO) significantly outperforms existing methods (PPO, TRPO, DDPG, and TD3).", + "title": "Truly Deterministic Policy Optimization", + "authors": [ + "Ehsan Saleh", + "Saba Ghaffari", + "Matthew West", + "Tim Bretl" + ], + "emails": [ + "~Ehsan_Saleh1", + "illinois.edu", + "~Tim_Bretl1", + "~Matthew_West1" + ], + "rank": 1275 + }, + { + "url": "https://openreview.net/forum?id=1cEEqSp9kXV", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 6 + ], + "rating": "5.54", + "confidences": [ + 3, + 3, + 4, + 3 + ], + "abstract": "The success of deep neural networks relied heavily on efficient stochastic gradient descent-like training methods. However, these methods are sensitive to initialization and hyper-parameters. \nIn this paper, a systematical method for finding multiple high-quality local optimal deep neural networks from a single training session, using the TRUST-TECH (TRansformation Under Stability-reTaining Equilibria Characterization) method, is introduced. \nTo realize effective TRUST-TECH searches to train deep neural networks on large datasets, a dynamic search paths (DSP) method is proposed to provide an improved search guidance in TRUST-TECH method. \nThe proposed DSP-TT method is implemented such that the computation graph remains constant during the search process, with only minor GPU memory overhead and requires just one training session to obtain multiple local optimal solutions (LOS). 
To take advantage of these LOSs, we also propose an improved ensemble method. Experiments on image classification datasets show that our method improves the testing performance by a substantial margin. Specifically, our fully-trained DSP-TT ResNet ensmeble improves the SGD baseline by 20\\% (CIFAR10) and 15\\%(CIFAR100). Furthermore, our method shows several advantages over other ensembling methods. ", + "title": "Constructing Multiple High-Quality Deep Neural Networks: A TRUST-TECH Based Approach", + "authors": [ + "Zhiyong Hao", + "Hsiao-Dong Chiang", + "Bin Wang" + ], + "emails": [ + "~Zhiyong_Hao1", + "~Hsiao-Dong_Chiang1", + "cornell.edu" + ], + "rank": 1276 + }, + { + "url": "https://openreview.net/forum?id=HZcDljfUljt", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 5 + ], + "rating": "5.54", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "Deep Neural Networks(DNNs) have many parameters and activation data, and these both are expensive to implement. One method to reduce the size of the DNN is to quantize the pre-trained model by using a low-bit expression for weights and activations, using fine-tuning to recover the drop in accuracy. However, it is generally difficult to train neural networks which use low-bit expressions. One reason is that the weights in the middle layer of the DNN have a wide dynamic range and so when quantizing the wide dynamic range into a few bits, the step size becomes large, which leads to a large quantization error and finally a large degradation in accuracy. To solve this problem, this paper makes the following three contributions without using any additional learning parameters and hyper-parameters. First, we analyze how batch normalization, which causes the aforementioned problem, disturbs the fine-tuning of the quantized DNN. 
Second, based on these results, we propose a new pruning method called Pruning for Quantization (PfQ) which removes the filters that disturb the fine-tuning of the DNN while not affecting the inferred result as far as possible. Third, we propose a workflow of fine-tuning for quantized DNNs using the proposed pruning method(PfQ). Experiments using well-known models and datasets confirmed that the proposed method achieves higher performance with a similar model size than conventional quantization methods including fine-tuning.", + "title": "Filter pre-pruning for improved fine-tuning of quantized deep neural networks", + "authors": [ + "Jun Nishikawa", + "Ryoji Ikegaya" + ], + "emails": [ + "~Ryoji_Ikegaya1", + "~Jun_Nishikawa2" + ], + "rank": 1277 + }, + { + "url": "https://openreview.net/forum?id=jQ0XleVhYuT", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 6 + ], + "rating": "5.54", + "confidences": [ + 2, + 4, + 3, + 4 + ], + "abstract": "In this article, we consider the problem of high-dimensional conditional independence testing, which is a key building block in statistics and machine learning. We propose a double generative adversarial networks (GAN)-based inference procedure. We first introduce a double GANs framework to learn two generators, and integrate the two generators to construct a doubly-robust test statistic. We next consider multiple generalized covariance measures, and take their maximum as our test statistic. Finally, we obtain the empirical distribution of our test statistic through multiplier bootstrap. We show that our test controls type-I error, while the\npower approaches one asymptotically. More importantly, these theoretical guarantees are obtained under much weaker and practically more feasible conditions compared to existing tests. 
We demonstrate the efficacy of our test through both synthetic and real datasets.", + "title": "Double Generative Adversarial Networks for Conditional Independence Testing", + "authors": [ + "Chengchun Shi", + "Tianlin Xu", + "Wicher Pieter Bergsma", + "Lexin Li" + ], + "emails": [ + "~Chengchun_Shi1", + "~Lexin_Li1", + "~Wicher_Pieter_Bergsma1", + "~Tianlin_Xu1" + ], + "rank": 1278 + }, + { + "url": "https://openreview.net/forum?id=IgIk8RRT-Z", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 5, + 7, + 6 + ], + "rating": "5.54", + "confidences": [ + 4, + 2, + 4, + 3 + ], + "abstract": "The emergence of CNNs in mainstream deployment has necessitated methods to design and train efficient architectures tailored to maximize the accuracy under diverse hardware and latency constraints. To scale these resource-intensive tasks with an increasing number of deployment targets, Once-For-All (OFA) proposed an approach to jointly train several models at once with a constant training cost. However, this cost remains as high as 40-50 GPU days and also suffers from a combinatorial explosion of sub-optimal model configurations. We seek to reduce this search space -- and hence the training budget -- by constraining search to models close to the accuracy-latency Pareto frontier. We incorporate insights of compound relationships between model dimensions to build CompOFA, a design space smaller by several orders of magnitude. Through experiments on ImageNet, we demonstrate that even with simple heuristics we can achieve a 2x reduction in training time and 216x speedup in model search/extraction time compared to the state of the art, without loss of Pareto optimality! We also show that this smaller design space is dense enough to support equally accurate models for a similar diversity of hardware and latency targets, while also reducing the complexity of the training and subsequent extraction algorithms. 
Our source code is available at https://github.com/gatech-sysml/CompOFA", + "title": "CompOFA \u2013 Compound Once-For-All Networks for Faster Multi-Platform Deployment", + "authors": [ + "Manas Sahni", + "Shreya Varshini", + "Alind Khare", + "Alexey Tumanov" + ], + "emails": [ + "gatech.edu", + "~Alind_Khare1", + "~Manas_Sahni1" + ], + "rank": 1279 + }, + { + "url": "https://openreview.net/forum?id=9z_dNsC4B5t", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 7, + 4 + ], + "rating": "5.53", + "confidences": [ + 4, + 3, + 3, + 5 + ], + "abstract": "Batch normalization plays a crucial role when training deep neural networks. However, batch statistics become unstable with small batch sizes and are unreliable in the presence of distribution shifts. We propose MetaNorm, a simple yet effective meta-learning normalization. It tackles the aforementioned issues in a unified way by leveraging the meta-learning setting and learns to infer adaptive statistics for batch normalization. MetaNorm is generic, flexible and model-agnostic, making it a simple plug-and-play module that is seamlessly embedded into existing meta-learning approaches. It can be efficiently implemented by lightweight hypernetworks with low computational cost. We verify its effectiveness by extensive evaluation on representative tasks suffering from the small batch and domain shift problems: few-shot learning and domain generalization. We further introduce an even more challenging setting: few-shot domain generalization. Results demonstrate that MetaNorm consistently achieves better, or at least competitive, accuracy compared to existing batch normalization methods. ", + "title": "MetaNorm: Learning to Normalize Few-Shot Batches Across Domains", + "authors": [ + "Yingjun Du", + "Xiantong Zhen", + "Ling Shao", + "Cees G. M. 
Snoek" + ], + "emails": [ + "~Xiantong_Zhen1", + "~Cees_G._M._Snoek1", + "~Yingjun_Du1", + "~Ling_Shao1" + ], + "rank": 1280 + }, + { + "url": "https://openreview.net/forum?id=vlcVTDaufN", + "decision": "Reject", + "ratings": [ + 5, + 8, + 6, + 7, + 3 + ], + "rating": "5.53", + "confidences": [ + 4, + 3, + 3, + 4, + 5 + ], + "abstract": "Combinatorial problems with linear objective function play a central role in many computer science applications, and efficient algorithms for solving them are well known. However, the solutions to these problems are not differentiable with respect to the parameters specifying the problem instance \u2013 for example, shortest distance between two nodes in a graph is not a differentiable function of graph edge weights. Recently, attempts to integrate combinatorial and, more broadly, convex optimization solvers into gradient-trained models resulted in several approaches for differentiating over the solution vector to the optimization problem. However, in many cases, the interest is in differentiating over only the objective value, not the solution vector, and using existing approaches introduces unnecessary overhead. Here, we show how to perform gradient descent directly over the objective value of the solution to combinatorial problems. 
We demonstrate advantage of the approach in examples involving sequence-to-sequence modeling using differentiable encoder-decoder architecture with softmax or Gumbel-softmax, and in weakly supervised learning involving a convolutional, residual feed-forward network for image classification.\n", + "title": "Differentiable Combinatorial Losses through Generalized Gradients of Linear Programs", + "authors": [ + "Xi Gao", + "Han Zhang", + "Aliakbar Panahi", + "Tom Arodz" + ], + "emails": [ + "vcu.edu", + "~Han_Zhang6", + "~Aliakbar_Panahi1", + "~Tom_Arodz1" + ], + "rank": 1281 + }, + { + "url": "https://openreview.net/forum?id=Dmpi13JiqcX", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 5 + ], + "rating": "5.53", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Representations in large language models such as BERT encode a range of features into a single vector, which are predictive in the context of a multitude of downstream tasks. In this paper, we explore whether it is possible to learn disentangled representations by identifying subnetworks in pre-trained models that encode distinct, complementary aspects of the representation. Concretely, we learn binary masks over transformer weights or hidden units to uncover the subset of features that correlate with a specific factor of variation. This sidesteps the need to train a disentangled model from scratch within a particular domain. We evaluate the ability of this method to disentangle representations of syntax and semantics, and sentiment from genre in the context of movie reviews. By combining this method with magnitude pruning we find that we can identify quite sparse subnetworks. Moreover, we find that this disentanglement-via-masking approach performs as well as or better than previously proposed methods based on variational autoencoders and adversarial training. 
", + "title": "Disentangling Representations of Text by Masking Transformers", + "authors": [ + "Xiongyi Zhang", + "Jan-Willem van de Meent", + "Byron C Wallace" + ], + "emails": [ + "~Jan-Willem_van_de_Meent1", + "~Byron_C_Wallace1", + "~Xiongyi_Zhang2" + ], + "rank": 1282 + }, + { + "url": "https://openreview.net/forum?id=ZAfeFYKUek5", + "decision": "Reject", + "ratings": [ + 5, + 5, + 7, + 5, + 5 + ], + "rating": "5.53", + "confidences": [ + 4, + 3, + 5, + 3, + 4 + ], + "abstract": "Unlike the conventional wisdom in statistical learning theory, the test error of a deep neural network (DNN) often demonstrates double descent: as the model complexity increases, it first follows a classical U-shaped curve and then shows a second descent. Through bias-variance decomposition, recent studies revealed that the bell-shaped variance is the major cause of model-wise double descent (when the DNN is widened gradually). This paper investigates epoch-wise double descent, i.e., the test error of a DNN also shows double descent as the number of training epochs increases. Specifically, we extend the bias-variance analysis to epoch-wise double descent, and reveal that the variance also contributes the most to the zero-one loss, as in model-wise double descent. Inspired by this result, we propose a novel metric called optimization variance to measure the diversity of model updates caused by the stochastic gradients of random training batches drawn in the same iteration. This metric can be estimated using samples from the training set only but correlates well with the test error. 
The proposed optimization variance can be used to predict the generalization ability of a DNN, and hence early stopping can be achieved without using any validation set.", + "title": "Optimization Variance: Exploring Generalization Properties of DNNs", + "authors": [ + "Xiao Zhang", + "Dongrui Wu", + "Haoyi Xiong", + "Bo Dai" + ], + "emails": [ + "~Bo_Dai2", + "~Xiao_Zhang8", + "~Dongrui_Wu1", + "~Haoyi_Xiong1" + ], + "rank": 1283 + }, + { + "url": "https://openreview.net/forum?id=yKYiyoHG4N3", + "decision": "Reject", + "ratings": [ + 4, + 7, + 5, + 6 + ], + "rating": "5.53", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "Multimodal program synthesis, which leverages different types of user input to synthesize a desired program, is an attractive way to scale program synthesis to challenging settings; however, it requires integrating noisy signals from the user (like natural language) with hard constraints on the program's behavior. This paper proposes an optimal neural synthesis approach where the goal is to find a program that satisfies user-provided constraints while also maximizing the program's score with respect to a neural model. Specifically, we focus on multimodal synthesis tasks in which the user intent is expressed using combination of natural language (NL) and input-output examples. At the core of our method is a top-down recurrent neural model that places distributions over abstract syntax trees conditioned on the NL input. This model not only allows for efficient search over the space of syntactically valid programs, but it allows us to leverage automated program analysis techniques for pruning the search space based on infeasibility of partial programs with respect to the user's constraints. 
The experimental results on a multimodal synthesis dataset (StructuredRegex) show that our method substantially outperforms prior state-of-the-art techniques in terms of accuracy and explores fewer states during search.", + "title": "Optimal Neural Program Synthesis from Multimodal Specifications", + "authors": [ + "Xi Ye", + "Qiaochu Chen", + "Isil Dillig", + "Greg Durrett" + ], + "emails": [ + "~Xi_Ye2", + "cs.utexas.edu", + "~Greg_Durrett1", + "~Isil_Dillig1" + ], + "rank": 1284 + }, + { + "url": "https://openreview.net/forum?id=mhEd8uOyNTI", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 6, + 6 + ], + "rating": "5.53", + "confidences": [ + 4, + 5, + 4, + 3, + 3 + ], + "abstract": "While contextual representations from Transformer-based architectures have set a new standard for many NLP tasks, there is not yet a complete accounting of their inner workings. In particular, it is not entirely clear what aspects of sentence-level syntax are captured by these representations, nor how (if at all) they are built along the stacked layers of the network. In this paper, we aim to address such questions with a general class of input perturbation-based analyses of representations from Transformer networks pretrained on self-supervised objectives. Importing from computational and cognitive neuroscience the notion of representational invariance, we perform a series of probes designed to test the sensitivity of Transformer representations to several kinds of structure in sentences. Each probe involves swapping words in a sentence and comparing the representations from perturbed sentences against the original. 
We experiment with three different perturbations: (1) random permutations of n-grams of varying width, to test the scale at which a representation is sensitive to word position; (2) swapping of two spans which do or do not form a syntactic phrase, to test sensitivity to global phrase structure; and (3) swapping of two adjacent words which do or do not break apart a syntactic phrase, to test sensitivity to local phrase structure. We also connect our probe results to the Transformer architecture by relating the attention mechanism to syntactic distance between two words. Results from the three probes collectively suggest that Transformers build sensitivity to larger parts of the sentence along their layers, and that hierarchical phrase structure plays a role in this process. In particular, sensitivity to local phrase structure increases along deeper layers. Based on our analysis of attention, we show that this is at least partly explained by generally larger attention weights between syntactically distant words.", + "title": "Representational correlates of hierarchical phrase structure in deep language models", + "authors": [ + "Matteo Alleman", + "Jonathan Mamou", + "Miguel A Del Rio", + "Hanlin Tang", + "Yoon Kim", + "SueYeon Chung" + ], + "emails": [ + "alum.mit.edu", + "~Yoon_Kim1", + "columbia.edu", + "~Hanlin_Tang1", + "~SueYeon_Chung1", + "intel.com" + ], + "rank": 1285 + }, + { + "url": "https://openreview.net/forum?id=gBpYGXH9J7F", + "decision": "Reject", + "ratings": [ + 5, + 5, + 7, + 5 + ], + "rating": "5.53", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "We study the design of efficient online learning algorithms tolerant to adversarially corrupted rewards. In particular, we study settings where an online algorithm makes a prediction at each time step, and receives a stochastic reward from the environment that can be arbitrarily corrupted with probability \u03f5\u2208[0,12). 
Here \u03f5 is the noise rate the characterizes the strength of the adversary. As is standard in online learning, we study the design of algorithms with small regret over a period of time steps. However, while the algorithm observes corrupted rewards, we require its regret to be small with respect to the true uncorrupted reward distribution. We build upon recent advances in robust estimation for unsupervised learning problems to design robust online algorithms with near optimal regret in three different scenarios: stochastic multi-armed bandits, linear contextual bandits, and Markov Decision Processes~(MDPs) with stochastic rewards and transitions. Finally, we provide empirical evidence regarding the robustness of our proposed algorithms on synthetic and real datasets.", + "title": "Online Learning under Adversarial Corruptions", + "authors": [ + "Pranjal Awasthi", + "Sreenivas Gollapudi", + "Kostas Kollias", + "Apaar Sadhwani" + ], + "emails": [ + "~Sreenivas_Gollapudi2", + "~Pranjal_Awasthi3", + "google.com" + ], + "rank": 1286 + }, + { + "url": "https://openreview.net/forum?id=J_pvI6ap5Mn", + "decision": "Reject", + "ratings": [ + 7, + 6, + 6, + 4 + ], + "rating": "5.53", + "confidences": [ + 3, + 4, + 3, + 5 + ], + "abstract": "Graph neural networks (GNNs) have been shown with superior performance in various applications, but training dedicated GNNs can be costly for large-scale graphs. Some recent work started to study the pre-training of GNNs. However, none of them provide theoretical insights into the design of their frameworks, or clear requirements and guarantees towards the transferability of GNNs. In this work, we establish a theoretically grounded and practically useful framework for the transfer learning of GNNs. 
Firstly, we propose a novel view towards the essential graph information and advocate the capturing of it as the goal of transferable GNN training, which motivates the design of EGI (ego-graph information maximization) to analytically achieve this goal. Secondly, we specify the requirement of structure-respecting node features as the GNN input, and conduct a rigorous analysis of GNN transferability based on the difference between the local graph Laplacians of the source and target graphs. Finally, we conduct controlled synthetic experiments to directly justify our theoretical conclusions. Extensive experiments on real-world networks towards role identification show consistent results in the rigorously analyzed setting of direct-transfering (freezing parameters), while those towards large-scale relation prediction show promising results in the more generalized and practical setting of transfering with fine-tuning.", + "title": "Transfer Learning of Graph Neural Networks with Ego-graph Information Maximization", + "authors": [ + "Qi Zhu", + "Yidan Xu", + "Haonan Wang", + "Chao Zhang", + "Jiawei Han", + "Carl Yang" + ], + "emails": [ + "~Chao_Zhang9", + "~Yidan_Xu1", + "~Qi_Zhu7", + "~Carl_Yang1", + "~Haonan_Wang1", + "~Jiawei_Han1" + ], + "rank": 1287 + }, + { + "url": "https://openreview.net/forum?id=Ek7qrYhJMbn", + "decision": "Reject", + "ratings": [ + 4, + 8, + 5, + 4 + ], + "rating": "5.53", + "confidences": [ + 4, + 5, + 3, + 3 + ], + "abstract": "Federated learning has become increasingly important for modern machine learning, especially for data privacy-sensitive scenarios. Existing federated learning mostly adopts the central server-based architecture or centralized architecture. However, in many social network scenarios, centralized federated learning is not applicable (e.g., a central agent or server connecting all users may not exist, or the communication cost to the central server is not affordable). 
In this paper, we consider a generic setting: 1) the central server may not exist, and 2) the social network is unidirectional or of single-sided trust (i.e., user A trusts user B but user B may not trust user A). We propose a central server free federated learning algorithm, named Online Push-Sum (OPS) method, to handle this challenging but generic scenario. A rigorous regret analysis is also provided, which shows interesting results on how users can benefit from communication with trusted users in the federated learning scenario. This work builds upon the fundamental algorithm framework and theoretical guarantees for federated learning in the generic social network scenario.", + "title": "Central Server Free Federated Learning over Single-sided Trust Social Networks", + "authors": [ + "Chaoyang He", + "Conghui Tan", + "Hanlin Tang", + "Shuang Qiu", + "Ji Liu" + ], + "emails": [ + "~Shuang_Qiu2", + "~Chaoyang_He1", + "~Conghui_Tan1", + "~Ji_Liu1", + "ur.rochester.edu" + ], + "rank": 1288 + }, + { + "url": "https://openreview.net/forum?id=eZllW0F5aM_", + "decision": "Reject", + "ratings": [ + 5, + 8, + 5, + 4 + ], + "rating": "5.53", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Graph neural networks have become a staple in problems addressing learning and analysis of data defined over graphs. However, several results suggest an inherent difficulty in extracting better performance by increasing the number of layers. Besides the classic vanishing gradient issues, recent works attribute this to a phenomenon peculiar to the extraction of node features in graph-based tasks, i.e., the need to consider multiple neighborhood sizes at the same time and adaptively tune them. In this paper, we investigate the recently proposed randomly wired architectures in the context of graph neural networks. 
Instead of building deeper networks by stacking many layers, we prove that employing a randomly-wired architecture can be a more effective way to increase the capacity of the network and obtain richer representations. We show that such architectures behave like an ensemble of paths, which are able to merge contributions from receptive fields of varied size. Moreover, these receptive fields can also be modulated to be wider or narrower through the trainable weights over the paths. We also provide extensive experimental evidence of the superior performance of randomly wired architectures over three tasks and five graph convolution definitions, using a recent benchmarking framework that addresses the reliability of previous testing methodologies.", + "title": "Don't stack layers in graph neural networks, wire them randomly", + "authors": [ + "Diego Valsesia", + "Giulia Fracastoro", + "Enrico Magli" + ], + "emails": [ + "~Giulia_Fracastoro1", + "~Enrico_Magli1", + "~Diego_Valsesia1" + ], + "rank": 1289 + }, + { + "url": "https://openreview.net/forum?id=1P2KAvsE59b", + "decision": "Reject", + "ratings": [ + 5, + 5, + 7, + 5 + ], + "rating": "5.53", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Why over-parameterized neural networks generalize as well as they do is a central concern of theoretical analysis in machine learning today. Following Occam's razor, it has long been suggested that simpler networks generalize better than more complex ones. Successfully quantifying this principle has proved difficult given that many measures of simplicity, such as parameter norms, grow with the size of the network and thus fail to capture the observation that larger networks tend to generalize better in practice.\nIn this paper, we introduce a new, theoretically motivated measure of a network's simplicity: the smallest fraction of the network's parameters that can be kept while pruning without adversely affecting its training loss. 
We show that this measure is highly predictive of a model's generalization performance across a large set of convolutional networks trained on CIFAR-10. Lastly, we study the mutual information between the predictions of our new measure and strong existing measures based on models' margin, flatness of minima and optimization speed. We show that our new measure is similar to -- but more predictive than -- existing flatness-based measures.", + "title": "Robustness to Pruning Predicts Generalization in Deep Neural Networks", + "authors": [ + "Lorenz Kuhn", + "Clare Lyle", + "Aidan Gomez", + "Jonas Rothfuss", + "Yarin Gal" + ], + "emails": [ + "~Clare_Lyle1", + "~Jonas_Rothfuss1", + "~Aidan_Gomez1", + "~Lorenz_Kuhn1", + "~Yarin_Gal1" + ], + "rank": 1290 + }, + { + "url": "https://openreview.net/forum?id=Mf4ZSXMZP7", + "decision": "Reject", + "ratings": [ + 4, + 4, + 6, + 6, + 7 + ], + "rating": "5.53", + "confidences": [ + 5, + 2, + 4, + 3, + 5 + ], + "abstract": "Lately, post-training quantization methods have gained considerable attention, as they are simple to use, and require only a small unlabeled calibration set. This small dataset cannot be used to fine-tune the model without significant over-fitting. Instead, these methods only use the calibration set to set the activations' dynamic ranges. However, such methods always resulted in significant accuracy degradation, when used below 8-bits (except on small datasets). Here we aim to break the 8-bit barrier. To this end, we minimize the quantization errors of each layer separately by optimizing its parameters over the calibration set. We empirically demonstrate that this approach is: (1) much less susceptible to over-fitting than the standard fine-tuning approaches, and can be used even on a very small calibration set; and (2) more powerful than previous methods, which only set the activations' dynamic ranges. 
Furthermore, we demonstrate how to optimally allocate the bit-widths for each layer, while constraining accuracy degradation or model compression by proposing a novel integer programming formulation. Finally, we suggest model global statistics tuning, to correct biases introduced during quantization. Together, these methods yield state-of-the-art results for both vision and text models. For instance, on ResNet50, we obtain less than 1\\% accuracy degradation --- with 4-bit weights and activations in all layers, but the smallest two. Our code is available at, https://github.com/papers-submission/CalibTIP", + "title": "Improving Post Training Neural Quantization: Layer-wise Calibration and Integer Programming", + "authors": [ + "Itay Hubara", + "Yury Nahshan", + "Yair Hanani", + "Ron Banner", + "Daniel Soudry" + ], + "emails": [ + "~Yair_Hanani1", + "~Ron_Banner1", + "~Yury_Nahshan1", + "~Daniel_Soudry1", + "~Itay_Hubara1" + ], + "rank": 1291 + }, + { + "url": "https://openreview.net/forum?id=7wCBOfJ8hJM", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 8, + 4, + 6 + ], + "rating": "5.53", + "confidences": [ + 3, + 4, + 5, + 5 + ], + "abstract": "We introduce k-nearest-neighbor machine translation (kNN-MT), which predicts tokens with a nearest-neighbor classifier over a large datastore of cached examples, using representations from a neural translation model for similarity search. This approach requires no additional training and scales to give the decoder direct access to billions of examples at test time, resulting in a highly expressive model that consistently improves performance across many settings. Simply adding nearest-neighbor search improves a state-of-the-art German-English translation model by 1.5 BLEU. kNN-MT allows a single model to be adapted to diverse domains by using a domain-specific datastore, improving results by an average of 9.2 BLEU over zero-shot transfer, and achieving new state-of-the-art results---without training on these domains. 
A massively multilingual model can also be specialized for particular language pairs, with improvements of 3 BLEU for translating from English into German and Chinese. Qualitatively, kNN-MT is easily interpretable; it combines source and target context to retrieve highly relevant examples.", + "title": "Nearest Neighbor Machine Translation", + "authors": [ + "Urvashi Khandelwal", + "Angela Fan", + "Dan Jurafsky", + "Luke Zettlemoyer", + "Mike Lewis" + ], + "emails": [ + "~Angela_Fan2", + "~Urvashi_Khandelwal1", + "~Mike_Lewis1", + "~Luke_Zettlemoyer1", + "~Dan_Jurafsky1" + ], + "rank": 1292 + }, + { + "url": "https://openreview.net/forum?id=5JnS8wROG9", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 6 + ], + "rating": "5.50", + "confidences": [ + 3, + 5, + 5, + 3 + ], + "abstract": "Training overparameterized convolutional neural networks with gradient based optimization is the most successful learning method for image classification. However, their generalization properties are far from understood. In this work, we consider a simplified image classification task where images contain orthogonal patches and are learned with a 3-layer overparameterized convolutional network and stochastic gradient descent (SGD). We empirically identify a novel phenomenon of SGD in our setting, where the dot-product between the learned pattern detectors and their detected patterns are governed by the pattern statistics in the training set. We call this phenomenon Pattern Statistics Inductive Bias (PSI) and empirically verify it in a large number of instances. We prove that in our setting, if a learning algorithm satisfies PSI then its sample complexity is O(d2log\u2061(d)) where d is the filter dimension. In contrast, we show a VC dimension lower bound which is exponential in d. 
We perform experiments with overparameterized CNNs on a variant of MNIST with non-orthogonal patches, and show that the empirical observations are in line with our analysis.", + "title": "On the Inductive Bias of a CNN for Distributions with Orthogonal Patterns", + "authors": [ + "Alon Brutzkus", + "Amir Globerson" + ], + "emails": [ + "~Amir_Globerson1", + "~Alon_Brutzkus1" + ], + "rank": 1293 + }, + { + "url": "https://openreview.net/forum?id=jNhWDHdjVi4", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 6 + ], + "rating": "5.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "We develop a new framework for learning variational autoencoders and other deep generative models that balances generative and discriminative goals. Our framework optimizes model parameters to maximize a variational lower bound on the likelihood of observed data, subject to a task-specific prediction constraint that prevents model misspecification from leading to inaccurate predictions. We further enforce a consistency constraint, derived naturally from the generative model, that requires predictions on reconstructed data to match those on the original data. We show that these two contributions -- prediction constraints and consistency constraints -- lead to promising image classification performance, especially in the semi-supervised scenario where category labels are sparse but unlabeled data is plentiful. Our approach enables advances in generative modeling to directly boost semi-supervised classification performance, an ability we demonstrate by augmenting deep generative models with latent variables capturing spatial transformations. 
", + "title": "Learning Consistent Deep Generative Models from Sparse Data via Prediction Constraints", + "authors": [ + "Gabriel Hope", + "Madina Abdrakhmanova", + "Xiaoyin Chen", + "Michael C Hughes", + "Erik B Sudderth" + ], + "emails": [ + "~Erik_B_Sudderth1", + "~Michael_C_Hughes1", + "uci.edu", + "~Gabriel_Hope1", + "~Madina_Abdrakhmanova1" + ], + "rank": 1294 + }, + { + "url": "https://openreview.net/forum?id=kmBFHJ5pr0o", + "decision": "Reject", + "ratings": [ + 5, + 5, + 8, + 4 + ], + "rating": "5.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Current deep neural networks are vulnerable to adversarial attacks, where adversarial perturbations to the inputs can change or manipulate classification. To defend against such attacks, an effective and popular approach, known as adversarial training, has been shown to mitigate the negative impact of adversarial attacks by virtue of a min-max robust training method. While effective, this approach is difficult to scale well to large models on large datasets (e.g., ImageNet) in general. To address this challenge, we propose distributed adversarial training (DAT), a large-batch adversarial training framework implemented over multiple machines. DAT supports one-shot and iterative attack generation methods, gradient quantization, and training over labeled and unlabeled data. Theoretically, we provide, under standard conditions in the optimization theory, the convergence rate of DAT to the first-order stationary points in general non-convex settings. 
Empirically, on ResNet-18 and -50 under CIFAR-10 and ImageNet, we demonstrate that DAT either matches or outperforms state-of-the-art robust accuracies and achieves a graceful training speedup.", + "title": "Distributed Adversarial Training to Robustify Deep Neural Networks at Scale", + "authors": [ + "Gaoyuan Zhang", + "Songtao Lu", + "Sijia Liu", + "Xiangyi Chen", + "Pin-Yu Chen", + "Lee Martie", + "Lior Horesh", + "Mingyi Hong" + ], + "emails": [ + "~Gaoyuan_Zhang1", + "~Songtao_Lu1", + "~Pin-Yu_Chen1", + "~Lior_Horesh1", + "~Xiangyi_Chen1", + "~Mingyi_Hong1", + "ibm.com", + "~Sijia_Liu1" + ], + "rank": 1295 + }, + { + "url": "https://openreview.net/forum?id=gW8n0uD6rl", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 5 + ], + "rating": "5.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Standard causal discovery methods must fit a new model whenever they encounter samples from a new underlying causal graph. However, these samples often share relevant information - for instance, the dynamics describing the effects of causal relations - which is lost when following this approach. We propose Amortized Causal Discovery, a novel framework that leverages such shared dynamics to learn to infer causal relations from time-series data. This enables us to train a single, amortized model that infers causal relations across samples with different underlying causal graphs, and thus makes use of the information that is shared. 
We demonstrate experimentally that this approach, implemented as a variational model, leads to significant improvements in causal discovery performance, and show how it can be extended to perform well under hidden confounding.\n", + "title": "Amortized Causal Discovery: Learning to Infer Causal Graphs from Time-Series Data", + "authors": [ + "Sindy L\u00f6we", + "David Madras", + "Richard Zemel", + "Max Welling" + ], + "emails": [ + "~Sindy_L\u00f6we1", + "~David_Madras1", + "~Max_Welling1", + "~Richard_Zemel1" + ], + "rank": 1296 + }, + { + "url": "https://openreview.net/forum?id=ce6CFXBh30h", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 4, + 6 + ], + "rating": "5.50", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "While existing federated learning approaches mostly require that clients have fully-labeled data to train on, in realistic settings, data obtained at the client-side often comes without any accompanying labels. Such deficiency of labels may result from either high labeling cost, or difficulty of annotation due to the requirement of expert knowledge. Thus the private data at each client may be either partly labeled, or completely unlabeled with labeled data being available only at the server, which leads us to a new practical federated learning problem, namely Federated Semi-Supervised Learning (FSSL). In this work, we study two essential scenarios of FSSL based on the location of the labeled data. The first scenario considers a conventional case where clients have both labeled and unlabeled data (labels-at-client), and the second scenario considers a more challenging case, where the labeled data is only available at the server (labels-at-server). We then propose a novel method to tackle the problems, which we refer to as Federated Matching (FedMatch). 
FedMatch improves upon naive combinations of federated learning and semi-supervised learning approaches with a new inter-client consistency loss and decomposition of the parameters for disjoint learning on labeled and unlabeled data. Through extensive experimental validation of our method in the two different scenarios, we show that our method outperforms both local semi-supervised learning and baselines which naively combine federated learning with semi-supervised learning.", + "title": "Federated Semi-Supervised Learning with Inter-Client Consistency & Disjoint Learning", + "authors": [ + "Wonyong Jeong", + "Jaehong Yoon", + "Eunho Yang", + "Sung Ju Hwang" + ], + "emails": [ + "~Eunho_Yang1", + "~Sung_Ju_Hwang1", + "~Wonyong_Jeong1", + "~Jaehong_Yoon1" + ], + "rank": 1297 + }, + { + "url": "https://openreview.net/forum?id=eyXknI5scWu", + "decision": "Reject", + "ratings": [ + 6, + 4, + 7, + 6 + ], + "rating": "5.50", + "confidences": [ + 2, + 5, + 3, + 4 + ], + "abstract": "Saliency maps that identify the most informative regions of an image for a classifier are valuable for model interpretability. A common approach to creating saliency maps involves generating input masks that mask out portions of an image to maximally deteriorate classification performance, or mask in an image to preserve classification performance. Many variants of this approach have been proposed in the literature, such as counterfactual generation and optimizing over a Gumbel-Softmax distribution. Using a general formulation of masking-based saliency methods, we conduct an extensive evaluation study of a number of recently proposed variants to understand which elements of these methods meaningfully improve performance. Surprisingly, we find that a well-tuned, relatively simple formulation of a masking-based saliency model outperforms many more complex approaches. 
We find that the most important ingredients for high quality saliency map generation are (1) using both masked-in and masked-out objectives and (2) training the classifier alongside the masking model. Strikingly, we show that a masking model can be trained with as few as 10 examples per class and still generate saliency maps with only a 0.7-point increase in localization error.", + "title": "Investigating and Simplifying Masking-based Saliency Methods for Model Interpretability", + "authors": [ + "Jason Phang", + "Jungkyu Park", + "Krzysztof J. Geras" + ], + "emails": [ + "~Krzysztof_J._Geras1", + "~Jungkyu_Park1", + "~Jason_Phang1" + ], + "rank": 1298 + }, + { + "url": "https://openreview.net/forum?id=eom0IUrF__F", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 4, + 6 + ], + "rating": "5.50", + "confidences": [ + 4, + 5, + 3 + ], + "abstract": "Dialogue state trackers have made significant progress on benchmark datasets, but their generalization capability to novel and realistic scenarios beyond the held- out conversations is less understood. We propose controllable counterfactuals (COCO) to bridge this gap and evaluate dialogue state tracking (DST) models on novel scenarios, i.e., would the system successfully tackle the request if the user responded differently but still consistently with the dialogue flow? COCO leverages turn-level belief states as counterfactual conditionals to produce novel conversation scenarios in two steps: (i) counterfactual goal generation at turn- level by dropping and adding slots followed by replacing slot values, (ii) counterfactual conversation generation that is conditioned on (i) and consistent with the dialogue flow. Evaluating state-of-the-art DST models on MultiWOZ dataset with COCO-generated counterfactuals results in a significant performance drop of up to 30.8% (from 49.4% to 18.6%) in absolute joint goal accuracy. In comparison, widely used techniques like paraphrasing only affect the accuracy by at most 2%. 
Human evaluations show that COCO-generated conversations perfectly reflect the underlying user goal with more than 95% accuracy and are as human-like as the original conversations, further strengthening its reliability and promise to be adopted as part of the robustness evaluation of DST models.", + "title": "CoCo: Controllable Counterfactuals for Evaluating Dialogue State Trackers", + "authors": [ + "SHIYANG LI", + "Semih Yavuz", + "Kazuma Hashimoto", + "Jia Li", + "Tong Niu", + "Nazneen Rajani", + "Xifeng Yan", + "Yingbo Zhou", + "Caiming Xiong" + ], + "emails": [ + "~Yingbo_Zhou1", + "salesforce.com", + "~Nazneen_Rajani1", + "~Caiming_Xiong1", + "~Kazuma_Hashimoto1", + "~SHIYANG_LI2", + "~Jia_Li8", + "~Semih_Yavuz1", + "~Xifeng_Yan1" + ], + "rank": 1299 + }, + { + "url": "https://openreview.net/forum?id=t4EWDRLHwcZ", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 6 + ], + "rating": "5.50", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Graph learning plays important role in many data mining and machine learning tasks, such as manifold learning, data representation and analysis, dimensionality reduction, data clustering, and visualization, etc. For the first time, we present a highly-scalable spectral graph densification approach (GRASPEL) for graph learning from data. By limiting the precision matrix to be a graph-Laplacian-like matrix in graphical Lasso, our approach aims to learn ultra-sparse undirected graphs from potentially high-dimensional input data. A very unique property of the graphs learned by GRASPEL is that the spectral embedding (or approximate effective-resistance) distances on the graph will encode the similarities between the original input data points. By interleaving the latest high-performance nearly-linear\ntime spectral methods, ultrasparse yet spectrally-robust graphs can be learned by identifying and including the most spectrally-critical edges into the graph. 
Compared with prior state-of-the-art graph learning approaches, GRASPEL is more scalable and allows substantially improving computing efficiency and solution quality of a variety of data mining and\nmachine learning applications, such as manifold learning, spectral clustering (SC), and dimensionality reduction.", + "title": "Graph Learning via Spectral Densification", + "authors": [ + "Zhuo Feng", + "Yongyu Wang", + "Zhiqiang Zhao" + ], + "emails": [ + "~Yongyu_Wang1", + "~Zhuo_Feng3", + "~Zhiqiang_Zhao1" + ], + "rank": 1300 + }, + { + "url": "https://openreview.net/forum?id=GKLLd9FOe5l", + "decision": "Reject", + "ratings": [ + 7, + 5, + 3, + 7 + ], + "rating": "5.50", + "confidences": [ + 5, + 4, + 4, + 3 + ], + "abstract": "Online A/B testing plays a critical role in high-tech industry to guide product development and accelerate innovation. It performs a null hypothesis statistical test to determine which variant is better. However, a typical A/B test presents two problems: (i) a fixed-horizon framework inflates the false positive errors under continuous monitoring; (ii) the homogeneous effects assumption fails to identify a subgroup with a beneficial treatment effect. In this paper, we propose a sequential test for subgroup treatment effects based on value difference, named SUBTLE, to address these two problems simultaneously. The SUBTLE allows the experimenters to \"peek\" the results during the experiment without harming the statistical guarantees. It assumes heterogeneous treatment effects and aims to test if some subgroup of the population will benefit from the investigative treatment. If the testing result indicates the existence of such subgroup, a subgroup will be identified using a readily available estimated optimal treatment rule. We examine the empirical performance of our proposed test on both simulations and a real data set. 
The results show that the SUBTLE has high detection power with controlled type I error at any time, is more robust to noise covariates, and can achieve early stopping compared with the corresponding fixed-horizon test.", + "title": "Online Testing of Subgroup Treatment Effects Based on Value Difference", + "authors": [ + "Miao Yu", + "Wenbin Lu", + "Rui Song" + ], + "emails": [ + "~Rui_Song2", + "~Miao_Yu5", + "~Wenbin_Lu1" + ], + "rank": 1301 + }, + { + "url": "https://openreview.net/forum?id=jfPU-u_52Tx", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 6 + ], + "rating": "5.50", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "This paper introduces Distributed Stein Variational Gradient Descent (DSVGD), a non-parametric generalized Bayesian inference framework for federated learning. DSVGD maintains a number of non-random and interacting particles at a central server to represent the current iterate of the model global posterior. The particles are iteratively downloaded and updated by one of the agents with the end goal of minimizing the global free energy. By varying the number of particles, DSVGD enables a flexible trade-off between per-iteration communication load and number of communication rounds. 
DSVGD is shown to compare favorably to benchmark frequentist and Bayesian federated learning strategies, also scheduling a single device per iteration, in terms of accuracy and scalability with respect to the number of agents, while also providing well-calibrated, and hence trustworthy, predictions.", + "title": "Federated Generalized Bayesian Learning via Distributed Stein Variational Gradient Descent", + "authors": [ + "Rahif Kassab", + "Osvaldo Simeone" + ], + "emails": [ + "~Osvaldo_Simeone1", + "~Rahif_Kassab1" + ], + "rank": 1302 + }, + { + "url": "https://openreview.net/forum?id=akgiLNAkC7P", + "decision": "Reject", + "ratings": [ + 7, + 5, + 6, + 4 + ], + "rating": "5.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Standard reinforcement learning (RL) algorithms train agents to maximize given reward functions. However, many real-world applications of RL require agents to also satisfy certain constraints which may, for example, be motivated by safety concerns. Constrained RL algorithms approach this problem by training agents to maximize given reward functions while respecting \\textit{explicitly} defined constraints. However, in many cases, manually designing accurate constraints is a challenging task. In this work, given a reward function and a set of demonstrations from an expert that maximizes this reward function while respecting \\textit{unknown} constraints, we propose a framework to learn the most likely constraints that the expert respects. We then train agents to maximize the given reward function subject to the learned constraints. Previous works in this regard have either mainly been restricted to tabular settings or specific types of constraints or assume knowledge of transition dynamics of the environment. 
In contrast, we empirically show that our framework is able to learn arbitrary \\textit{Markovian} constraints in high-dimensions in a model-free setting.", + "title": "Inverse Constrained Reinforcement Learning", + "authors": [ + "Shehryar Malik", + "Usman Anwar", + "Alireza Aghasi", + "Ali Ahmed" + ], + "emails": [ + "~Shehryar_Malik1", + "~Alireza_Aghasi2", + "itu.edu.pk", + "~Ali_Ahmed1" + ], + "rank": 1303 + }, + { + "url": "https://openreview.net/forum?id=cxRUccyjw0S", + "decision": "Reject", + "ratings": [ + 6, + 6, + 4 + ], + "rating": "5.50", + "confidences": [ + 4, + 5, + 3 + ], + "abstract": "Recent approaches for unsupervised image translation are strongly reliant on generative adversarial training and architectural locality constraints. Despite their appealing results, it can be easily observed that the learned class and content representations are entangled which often hurts the translation performance. To this end, we propose OverLORD, for learning disentangled representations for the image class and attributes, utilizing latent optimization and carefully designed content and style bottlenecks. We further argue that the commonly used adversarial optimization can be decoupled from representation disentanglement and be applied at a later stage of the training to increase the perceptual quality of the generated images. 
Based on these principles, our model learns significantly more disentangled representations and achieves higher translation quality and greater output diversity than state-of-the-art methods.", + "title": "Learning Disentangled Representations for Image Translation", + "authors": [ + "Aviv Gabbay", + "Yedid Hoshen" + ], + "emails": [ + "~Aviv_Gabbay1", + "~Yedid_Hoshen3" + ], + "rank": 1304 + }, + { + "url": "https://openreview.net/forum?id=l47UCAewjy", + "ratings": [ + 5, + 8, + 5, + 4 + ], + "rating": "5.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "This paper aims to evaluate and analyze the complexity of feature transformations encoded in DNNs. We propose metrics to measure three types of complexity of transformations based on the information theory. We further discover and prove the negative correlation between the complexity and the disentanglement of transformations. Based on the proposed metrics, we analyze two typical phenomena of the change of the transformation complexity during the training process, and explore the ceiling of a DNN\u2019s complexity. The proposed metrics can also be used as a loss to learn a DNN with the minimum complexity, which also controls the significance of over-fitting of the DNN. Comprehensive comparative studies have provided new perspectives to understand the DNN. 
We will release the code when the paper is accepted.", + "title": "Understanding, Analyzing, and Optimizing the Complexity of Deep Models", + "authors": [ + "Jie Ren", + "Mingjie Li", + "Meng Zhou", + "Shih-Han Chan", + "Zexu Liu", + "Quanshi Zhang" + ], + "emails": [ + "~Quanshi_Zhang1", + "~Zexu_Liu2", + "~Mingjie_Li3", + "~Jie_Ren1", + "~Meng_Zhou2", + "~Shih-Han_Chan1" + ], + "rank": 1305 + }, + { + "url": "https://openreview.net/forum?id=w5uur-ZwCXn", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 5 + ], + "rating": "5.50", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "Transfer learning has yielded state-of-the-art (SoTA) results in many supervised NLP tasks. However, annotated data for every target task in every target language is rare, especially for low-resource languages. We propose XLA, a novel data augmentation framework for self-supervised learning in zero-resource transfer learning scenarios. In particular, XLA aims to solve cross-lingual adaptation problems from a source language task distribution to an unknown target language task distribution, assuming no training label in the target language task. At its core, XLA performs simultaneous self-training with data augmentation and unsupervised sample selection. To show its effectiveness, we conduct extensive experiments on zero-resource cross-lingual transfer tasks for Named Entity Recognition (NER), Natural Language Inference (NLI) and paraphrase identification on Paraphrase Adversaries from Word Scrambling (PAWS). XLA achieves SoTA results in all the tasks, outperforming the baselines by a good margin. With an in-depth framework dissection, we demonstrate the cumulative contributions of different components to XLA's success. 
", + "title": "XLA: A Robust Unsupervised Data Augmentation Framework for Cross-Lingual NLP", + "authors": [ + "M Saiful Bari", + "Tasnim Mohiuddin", + "Shafiq Joty" + ], + "emails": [ + "~M_Saiful_Bari2", + "~Shafiq_Joty1", + "~Tasnim_Mohiuddin1" + ], + "rank": 1306 + }, + { + "url": "https://openreview.net/forum?id=uUAuBTcIIwq", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 6 + ], + "rating": "5.50", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "We present a novel deep generative model based on non i.i.d. variational autoencoders that captures global dependencies among observations in a fully unsupervised fashion. In contrast to the recent semi-supervised alternatives for global modeling in deep generative models, our approach combines a mixture model in the local or data-dependent space and a global Gaussian latent variable, which lead us to obtain three particular insights. First, the induced latent global space captures interpretable disentangled representations with no user-defined regularization in the evidence lower bound (as in beta-VAE and its generalizations). Second, we show that the model performs domain alignment to find correlations and interpolate between different databases. Finally, we study the ability of the global space to discriminate between groups of observations with non-trivial underlying structures, such as face images with shared attributes or defined sequences of digits images.", + "title": "Unsupervised Learning of Global Factors in Deep Generative Models", + "authors": [ + "Ignacio Peis", + "Pablo M. 
Olmos", + "Antonio Art\u00e9s" + ], + "emails": [ + "~Pablo_M._Olmos1", + "~Antonio_Art\u00e9s1", + "~Ignacio_Peis1" + ], + "rank": 1307 + }, + { + "url": "https://openreview.net/forum?id=yBJihVXahXc", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 6 + ], + "rating": "5.50", + "confidences": [ + 4, + 5, + 4, + 5 + ], + "abstract": "Graph convolutional networks (GCNs) have emerged as a powerful framework for mining and learning with graphs. A recent study shows that GCNs can be simplified as a linear model by removing nonlinearities and weight matrices across all consecutive layers, resulting the simple graph convolution (SGC) model. In this paper, we aim to understand GCNs and generalize SGC as a linear model via heat kernel (HKGCN), which acts as a low-pass filter on graphs and enables the aggregation of information from extremely large receptive fields. We theoretically show that HKGCN is in nature a continuous propagation model and GCNs without nonlinearities (i.e., SGC) are the discrete versions of it. Its low-pass filter and continuity properties facilitate the fast and smooth convergence of feature propagation. 
Experiments on million-scale networks show that the linear HKGCN model not only achieves consistently better results than SGC but also can match or even beat advanced GCN models, while maintaining SGC\u2019s superiority in efficiency.", + "title": "Generalizing Graph Convolutional Networks via Heat Kernel", + "authors": [ + "Jialin Zhao", + "Yuxiao Dong", + "Jie Tang", + "Ming Ding", + "Kuansan Wang" + ], + "emails": [ + "~Jialin_Zhao1", + "mails.tsinghua.edu.cn", + "~Jie_Tang1", + "~Yuxiao_Dong1", + "~Kuansan_Wang1" + ], + "rank": 1308 + }, + { + "url": "https://openreview.net/forum?id=ClZ4IcqnFXB", + "decision": "Reject", + "ratings": [ + 7, + 5, + 4, + 6 + ], + "rating": "5.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Many real-world situations allow for the acquisition of additional relevant information when making an assessment with limited or uncertain data. However, traditional ML approaches either require all features to be acquired beforehand or regard part of them as missing data that cannot be acquired. In this work, we propose models that perform active feature acquisition (AFA) to improve the prediction assessments at evaluation time. We formulate the AFA problem as a Markov decision process (MDP) and resolve it using reinforcement learning (RL). The AFA problem yields sparse rewards and contains a high-dimensional complicated action space. Thus, we propose learning a generative surrogate model that captures the complicated dependencies among input features to assess potential information gain from acquisitions. We also leverage the generative surrogate model to provide intermediate rewards and auxiliary information to the agent. Furthermore, we extend AFA in a task we coin active instance recognition (AIR) for the unsupervised case where the target variables are the unobserved features themselves and the goal is to collect information for a particular instance in a cost-efficient way. 
Empirical results demonstrate that our approach achieves considerably better performance than previous state of the art methods on both supervised and unsupervised tasks.", + "title": "Active Feature Acquisition with Generative Surrogate Models", + "authors": [ + "Yang Li", + "Junier Oliva" + ], + "emails": [ + "~Junier_Oliva1", + "~Yang_Li19" + ], + "rank": 1309 + }, + { + "url": "https://openreview.net/forum?id=QfEssgaXpm", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 6 + ], + "rating": "5.50", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "Reinforcement learning is promising to control dynamical systems for which the traditional control methods are hardly applicable. However, in control theory, the stability of a closed-loop system can be hardly guaranteed using the policy/controller learned solely from samples. In this paper, we will combine Lyapunov's method in control theory and stochastic analysis to analyze the mean square stability of MDP in a model-free manner. Furthermore, the finite sample bounds on the probability of stability are derived as a function of the number M and length T of the sampled trajectories. And we show that there is a lower bound on T and the probability is much more demanding for M than T. Based on the theoretical results, a REINFORCE like algorithm is proposed to learn the controller and the Lyapunov function simultaneously. 
", + "title": "Reinforcement Learning for Control with Probabilistic Stability Guarantee", + "authors": [ + "Minghao Han", + "Zhipeng Zhou", + "Lixian Zhang", + "Jun Wang", + "Wei Pan" + ], + "emails": [ + "~Minghao_Han2", + "~Wei_Pan2", + "hit.edu.cn", + "~Zhipeng_Zhou3", + "~Jun_Wang2" + ], + "rank": 1310 + }, + { + "url": "https://openreview.net/forum?id=Px7xIKHjmMS", + "decision": "Reject", + "ratings": [ + 5, + 8, + 5, + 4 + ], + "rating": "5.50", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "Despite their popularity in learning problems over graph structured data, existing Graph Neural Networks (GNNs) have inherent limitations for fundamental graph problems such as shortest paths, k-connectivity, minimum spanning tree and minimum cuts. In all these instances, it is known that one needs GNNs of high depth, scaling at a polynomial rate with the number of nodes n, to provably encode the solution space. This in turn affects their statistical efficiency thus requiring a significant amount of training data in order to obtain networks with good performance. In this work we propose a new hybrid architecture to overcome this limitation. Our proposed architecture that we call as GNNplus networks involve a combination of multiple parallel low depth GNNs along with simple pooling layers involving low depth fully connected networks. We provably demonstrate that for many graph problems, the solution space can be encoded by GNNplus networks using depth that scales only poly-logarithmically in the number of nodes. This significantly improves the amount of training data needed that we establish via improved generalization bounds. 
Finally, we empirically demonstrate the effectiveness of our proposed architecture for a variety of graph problems.\n", + "title": "Beyond GNNs: A Sample Efficient Architecture for Graph Problems", + "authors": [ + "Pranjal Awasthi", + "Abhimanyu Das", + "Sreenivas Gollapudi" + ], + "emails": [ + "~Sreenivas_Gollapudi2", + "~Pranjal_Awasthi3", + "google.com" + ], + "rank": 1311 + }, + { + "url": "https://openreview.net/forum?id=OjUsDdCpR5", + "decision": "Reject", + "ratings": [ + 4, + 6, + 7, + 5 + ], + "rating": "5.50", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "Covariance estimation on high-dimensional data is a central challenge across multiple scientific disciplines. Sparse high-dimensional count data, frequently encountered in biological applications such as DNA sequencing and proteomics, are often well modeled using multinomial logistic normal models. In many cases, these datasets are also compositional, presented item-wise as fractions of a normalized total, due to measurement and instrument constraints. In compositional settings, three key factors limit the ability of these models to estimate covariance: (1) the computational complexity of inverting high-dimensional covariance matrices, (2) the non-exchangeability introduced from the summation constraint on multinomial parameters, and (3) the irreducibility of the component multinomial logistic normal distribution that necessitates the use of parameter augmentation, or similar techniques, during inference. 
We show that a variational autoencoder augmented with a fast isometric log-ratio (ILR) transform can address these issues and accurately estimate principal components from multinomially logistic normal distributed data.\nThis model can be optimized on GPUs and modified to handle mini-batching, with the ability to scale across thousands of dimensions and thousands of samples.", + "title": "Inferring Principal Components in the Simplex with Multinomial Variational Autoencoders", + "authors": [ + "James Morton", + "Justin Silverman", + "Gleb Tikhonov", + "Harri L\u00e4hdesm\u00e4ki", + "Rich Bonneau" + ], + "emails": [ + "flatironinstitute.org", + "gmail.com", + "aalto.fi", + "~James_Morton1", + "~Harri_L\u00e4hdesm\u00e4ki1" + ], + "rank": 1312 + }, + { + "url": "https://openreview.net/forum?id=ErrNJYcVRmS", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 5 + ], + "rating": "5.50", + "confidences": [ + 3, + 3, + 4, + 2 + ], + "abstract": "In this paper, we present F^2ed-Learning, the first federated learning protocol simultaneously defending against both semi-honest server and Byzantine malicious clients. Using a robust mean estimator called FilterL2, F^2ed-Learning is the first FL protocol with dimension-free estimation error against Byzantine malicious clients. Besides, F^2ed-Learning leverages secure aggregation to protect the clients from a semi-honest server who wants to infer the clients' information from the legitimate updates. The main challenge stems from the incompatibility between FilterL2 and secure aggregation. Specifically, to run FilterL2, the server needs to access individual updates from clients while secure aggregation hides those updates from it. We propose to split the clients into shards, securely aggregate each shard's updates and run FilterL2 on the updates from different shards. The evaluation shows that F^2ed-Learning consistently achieves optimal or sub-optimal performance under three attacks among five robust FL protocols. 
The code for evaluation is available in the supplementary material.", + "title": "F^2ed-Learning: Good Fences Make Good Neighbors", + "authors": [ + "Lun Wang", + "Qi Pang", + "Shuai Wang", + "Dawn Song" + ], + "emails": [ + "~Dawn_Song1", + "~Lun_Wang1", + "~Shuai_Wang7", + "~Qi_Pang1" + ], + "rank": 1313 + }, + { + "url": "https://openreview.net/forum?id=xCy9thPPTb_", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 5 + ], + "rating": "5.50", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": " Neural networks are popular and useful in many fields, but they have the problem of giving high confidence responses for examples that are away from the training data. This results in the neural networks being very confident while making gross mistakes, thus limiting their reliability for safety critical applications such as autonomous driving, space exploration, etc.\nIn this paper, we present a more generic neuron formulation that contains the standard dot-product based neuron and the RBF neuron as two extreme cases of a shape parameter. Using ReLU as the activation function we obtain a novel neuron that compact support, which means its output is zero outside a bounded domain. We also show how to avoid difficulties in training a neural network with such neurons, by starting with a trained standard neural network and gradually increasing the shape parameter to the desired value. 
\nThrough experiments on standard benchmark datasets, we show the promise of the proposed approach, in that it can have good prediction on in-distribution samples, and it can consistently detect and have low confidence on out of distribution samples.", + "title": "The Compact Support Neural Network", + "authors": [ + "Adrian Barbu", + "Hongyu Mou" + ], + "emails": [ + "~Adrian_Barbu1", + "my.fsu.edu" + ], + "rank": 1314 + }, + { + "url": "https://openreview.net/forum?id=3jJKpFbLkU2", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 5 + ], + "rating": "5.50", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "While deep neural networks provide good performance for a range of challenging tasks, calibration and uncertainty estimation remain major challenges. In this paper, we propose the amortized conditional normalized maximum likelihood (ACNML)\nmethod as a scalable general-purpose approach for uncertainty estimation, calibration, and out-of-distribution robustness with deep networks. Our algorithm builds on the conditional normalized maximum likelihood (CNML) coding scheme, which has minimax optimal properties according to the minimum description length principle, but is computationally intractable to evaluate exactly for all but the simplest of model classes. We propose to use approximate Bayesian inference technqiues to produce a tractable approximation to the CNML distribution. Our approach can be combined with any approximate inference algorithm that provides tractable posterior densities over model parameters. 
We demonstrate that ACNML compares favorably to a number of prior techniques for uncertainty estimation in terms of and calibration on out-of-distribution inputs.", + "title": "Amortized Conditional Normalized Maximum Likelihood", + "authors": [ + "Aurick Zhou", + "Sergey Levine" + ], + "emails": [ + "~Sergey_Levine1", + "~Aurick_Zhou1" + ], + "rank": 1315 + }, + { + "url": "https://openreview.net/forum?id=71zCSP_HuBN", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 4, + 7, + 5 + ], + "rating": "5.50", + "confidences": [ + 2, + 4, + 3, + 3 + ], + "abstract": "We develop an algorithm to train individually fair learning-to-rank (LTR) models. The proposed approach ensures items from minority groups appear alongside similar items from majority groups. This notion of fair ranking is based on the definition of individual fairness from supervised learning and is more nuanced than prior fair LTR approaches that simply ensure the ranking model provides underrepresented items with a basic level of exposure. The crux of our method is an optimal transport-based regularizer that enforces individual fairness and an efficient algorithm for optimizing the regularizer. We show that our approach leads to certifiably individually fair LTR models and demonstrate the efficacy of our method on ranking tasks subject to demographic biases.", + "title": "Individually Fair Rankings", + "authors": [ + "Amanda Bower", + "Hamid Eftekhari", + "Mikhail Yurochkin", + "Yuekai Sun" + ], + "emails": [ + "~Amanda_Bower1", + "umich.edu", + "~Mikhail_Yurochkin1", + "~Yuekai_Sun1" + ], + "rank": 1316 + }, + { + "url": "https://openreview.net/forum?id=JeweO9-QqV-", + "decision": "Reject", + "ratings": [ + 7, + 5, + 5, + 5 + ], + "rating": "5.50", + "confidences": [ + 4, + 5, + 3, + 4 + ], + "abstract": "We introduce a second-order graph convolution (SoGC), a maximally localized kernel, that can express a polynomial spectral filter with arbitrary coefficients. 
We contrast our SoGC with vanilla GCN, first-order (one-hop) aggregation, and higher-order (multi-hop) aggregation by analyzing graph convolutional layers via generalized filter space. We argue that SoGC is a simple design capable of forming the basic building block of graph convolution, playing the same role as 3\u00d73 kernels in CNNs. We build purely topological Second-Order Graph Convolutional Networks (SoGCN) and demonstrate that SoGCN consistently achieves state-of-the-art performance on the latest benchmark. Moreover, we introduce the Gated Recurrent Unit (GRU) to spectral GCNs. This explorative attempt further improves our experimental results.", + "title": "SoGCN: Second-Order Graph Convolutional Networks", + "authors": [ + "Peihao Wang", + "Yuehao Wang", + "Hua Lin", + "Jianbo Shi" + ], + "emails": [ + "~Yuehao_Wang1", + "~Hua_Lin1", + "~Peihao_Wang1", + "~Jianbo_Shi1" + ], + "rank": 1317 + }, + { + "url": "https://openreview.net/forum?id=w6Vm1Vob0-X", + "decision": "Reject", + "ratings": [ + 7, + 7, + 4 + ], + "rating": "5.50", + "confidences": [ + 1, + 4, + 5 + ], + "abstract": "Graph neural networks (GNNs) have been extensively studied for prediction tasks on graphs. Most GNNs assume local homophily, i.e., strong similarities in local neighborhoods. This assumption limits the generalizability of GNNs, which has been demonstrated by recent work on disassortative graphs with weak local homophily. In this paper, we argue that GNN's feature aggregation scheme can be made flexible and adaptive to data without the assumption of local homophily. To demonstrate, we propose a GNN model with a global self-attention mechanism defined using learnable spectral filters, which can attend to any nodes, regardless of distance. We evaluated the proposed model on node classification tasks over six benchmark datasets. The proposed model has been shown to generalize well to both assortative and disassortative graphs. 
Further, it outperforms all state-of-the-art baselines on disassortative graphs and performs comparably with them on assortative graphs. ", + "title": "Global Node Attentions via Adaptive Spectral Filters", + "authors": [ + "Shouheng Li", + "Dongwoo Kim", + "Qing Wang" + ], + "emails": [ + "~Shouheng_Li1", + "~Dongwoo_Kim1", + "anu.edu.au" + ], + "rank": 1318 + }, + { + "url": "https://openreview.net/forum?id=TCAmP8zKZ6k", + "decision": "Reject", + "ratings": [ + 6, + 6, + 4, + 6 + ], + "rating": "5.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Dialogue system for medical automatic diagnosis (DSMAD) aims to learn an agent that mimics the behavior of a human doctor, i.e. inquiring symptoms and informing diseases. Since DSMAD has been formulated as a Markov decision-making process, many studies apply reinforcement learning methods to solve it. Unfortunately, existing works solely rely on simple diagnostic accuracy to justify the effectiveness of their DSMAD agents while ignoring the medical rationality of the inquiring process. From the perspective of medical application, it's critical to develop an agent that is able to produce reliable and convincing diagnosing processes and also is robust in making diagnosis facing noisy interaction with patients. To this end, we propose a novel DSMAD agent, INS-DS (Introspective Diagnosis System) comprising of two separate yet cooperative modules, i.e., an inquiry module for proposing symptom-inquiries and an introspective module for deciding when to inform a disease. 
INS-DS is inspired by the introspective decision-making process of human, where the inquiry module first proposes the most valuable symptom inquiry, and then the introspective module intervenes the potential responses of this inquiry and decides to inquire only if the diagnoses of these interventions vary.", + "title": "Towards a Reliable and Robust Dialogue System for Medical Automatic Diagnosis", + "authors": [ + "Junfan Lin", + "Lin Xu", + "Ziliang Chen", + "Liang Lin" + ], + "emails": [ + "~Liang_Lin1", + "~Junfan_Lin1", + "~Ziliang_Chen1", + "~Lin_Xu3" + ], + "rank": 1319 + }, + { + "url": "https://openreview.net/forum?id=3SV-ZePhnZM", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 6, + 5 + ], + "rating": "5.50", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "The capability of incrementally learning new tasks without forgetting old ones is a challenging problem due to catastrophic forgetting. This challenge becomes greater when novel tasks contain very few labelled training samples. Currently, most methods are dedicated to class-incremental learning and rely on sufficient training data to learn additional weights for newly added classes. Those methods cannot be easily extended to incremental regression tasks and could suffer from severe overfitting when learning few-shot novel tasks. In this study, we propose a nonparametric method in deep embedded space to tackle incremental few-shot learning problems. The knowledge about the learned tasks are compressed into a small number of quantized reference vectors. The proposed method learns new tasks sequentially by adding more reference vectors to the model using few-shot samples in each novel task. For classification problems, we employ the nearest neighbor scheme to make classification on sparsely available data and incorporate intra-class variation, less forgetting regularization and calibration of reference vectors to mitigate catastrophic forgetting. 
In addition, the proposed learning vector quantization (LVQ) in deep embedded space can be customized as a kernel smoother to handle incremental few-shot regression tasks. Experimental results demonstrate that the proposed method outperforms other state-of-the-art methods in incremental learning.", + "title": "Incremental few-shot learning via vector quantization in deep embedded space", + "authors": [ + "Kuilin Chen", + "Chi-Guhn Lee" + ], + "emails": [ + "~Chi-Guhn_Lee1", + "~Kuilin_Chen1" + ], + "rank": 1320 + }, + { + "url": "https://openreview.net/forum?id=PvVbsAmxdlZ", + "decision": "Reject", + "ratings": [ + 7, + 4, + 7, + 4 + ], + "rating": "5.50", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "Deep reinforcement learning (DRL) has demonstrated impressive performance in various gaming simulators and real-world applications. In practice, however, a DRL agent may receive faulty observation by abrupt interferences such as black-out, frozen-screen, and adversarial perturbation. How to design a resilient DRL algorithm against these rare but mission-critical and safety-crucial scenarios is an important yet challenging task. In this paper, we consider a resilient DRL framework with observational interferences. Under this framework, we discuss the importance of the causal relation and propose a causal inference based DRL algorithm called causal inference Q-network (CIQ). We evaluate the performance of CIQ in several benchmark DRL environments with different types of interferences. 
Our experimental results show that the proposed CIQ method could achieve higher performance and more resilience against observational interferences.", + "title": "Causal Inference Q-Network: Toward Resilient Reinforcement Learning", + "authors": [ + "Chao-Han Huck Yang", + "Danny I-Te Hung", + "Yi Ouyang", + "Pin-Yu Chen" + ], + "emails": [ + "columbia.edu", + "~Yi_Ouyang1", + "~Pin-Yu_Chen1", + "~Chao-Han_Huck_Yang1" + ], + "rank": 1321 + }, + { + "url": "https://openreview.net/forum?id=6puUoArESGp", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 5, + 7, + 6 + ], + "rating": "5.50", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Concept-based explanation approach is a popular model interpertability tool because it expresses the reasons for a model's predictions in terms of concepts that are meaningful for the domain experts. In this work, we study the problem of the concepts being correlated with confounding information in the features. We propose a new causal prior graph for modeling the impacts of unobserved variables and a method to remove the impact of confounding information and noise using a two-stage regression technique borrowed from the instrumental variable literature. We also model the completeness of the concepts set and show that our debiasing method works when the concepts are not complete. 
Our synthetic and real-world experiments demonstrate the success of our method in removing biases and improving the ranking of the concepts in terms of their contribution to the explanation of the predictions.", + "title": "Debiasing Concept-based Explanations with Causal Analysis", + "authors": [ + "Mohammad Taha Bahadori", + "David Heckerman" + ], + "emails": [ + "~David_Heckerman1", + "~Mohammad_Taha_Bahadori1" + ], + "rank": 1322 + }, + { + "url": "https://openreview.net/forum?id=6X_32jLUaDg", + "decision": "Reject", + "ratings": [ + 4, + 6, + 6, + 6 + ], + "rating": "5.50", + "confidences": [ + 4, + 3, + 5, + 4 + ], + "abstract": "Self-driving cars must detect other vehicles and pedestrians in 3D to plan safe routes and avoid collisions. \nState-of-the-art 3D object detectors, based on deep learning, have shown promising accuracy but are prone to over-fit to domain idiosyncrasies, causing them to fail in new environments---a serious problem if autonomous vehicles are meant to operate freely. In this paper, we propose a novel learning approach that drastically reduces this gap by fine-tuning the detector on pseudo-labels in the target domain, which our method generates while the vehicle is parked, based on replays of previously recorded driving sequences. In these replays, objects are tracked over time, and detections are interpolated and extrapolated---crucially, leveraging future information to catch hard cases. 
We show, on five autonomous driving datasets, that fine-tuning the detector on these pseudo-labels substantially reduces the domain-gap to new driving environments, yielding drastic improvements in accuracy and detection reliability.", + "title": "Exploiting Playbacks in Unsupervised Domain Adaptation for 3D Object Detection", + "authors": [ + "Yurong You", + "Carlos Andres Diaz-Ruiz", + "Yan Wang", + "Wei-Lun Chao", + "Bharath Hariharan", + "Mark Campbell", + "Kilian Q Weinberger" + ], + "emails": [ + "~Yan_Wang10", + "~Wei-Lun_Chao1", + "~Bharath_Hariharan3", + "~Kilian_Q_Weinberger1", + "~Mark_Campbell1", + "~Carlos_Andres_Diaz-Ruiz1", + "~Yurong_You1" + ], + "rank": 1323 + }, + { + "url": "https://openreview.net/forum?id=WDVD4lUCTzU", + "decision": "Reject", + "ratings": [ + 6, + 7, + 4, + 5 + ], + "rating": "5.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "This paper presents a novel training method, Conditional Masked Language Modeling (CMLM), to effectively learn sentence representations on large scale unlabeled corpora. CMLM integrates sentence representation learning into MLM training by conditioning on the encoded vectors of adjacent sentences. Our English CMLM model achieves state-of-the-art performance on SentEval, even outperforming models learned using (semi-)supervised signals. As a fully unsupervised learning method, CMLM can be conveniently extended to a broad range of languages and domains. We find that a multilingual CMLM model co-trained with bitext retrieval~(BR) and natural language inference~(NLI) tasks outperforms the previous state-of-the-art multilingual models by a large margin. 
We explore the same language bias of the learned representations, and propose a principle component based approach to remove the language identifying information from the representation while still retaining sentence semantics.", + "title": "Universal Sentence Representations Learning with Conditional Masked Language Model", + "authors": [ + "Ziyi Yang", + "Yinfei Yang", + "Daniel M Cer", + "Jax Law", + "Eric Darve" + ], + "emails": [ + "~Ziyi_Yang1", + "~Daniel_M_Cer1", + "google.com", + "~Eric_Darve1", + "~Yinfei_Yang1" + ], + "rank": 1324 + }, + { + "url": "https://openreview.net/forum?id=KIS8jqLp4fQ", + "decision": "Reject", + "ratings": [ + 7, + 5, + 4, + 6 + ], + "rating": "5.50", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Protecting privacy in learning while maintaining the model performance has become increasingly critical in many applications that involve sensitive data. Private Gradient Descent (PGD) is a commonly used private learning framework, which adds noise according to the Differential Privacy protocol.Recent studies show that dynamic privacy schedules of decreasing noise magnitudes can improve loss at the final iteration, and yet theoretical understandings of the effectiveness of such schedules and their connections to optimization algorithms remain limited. In this paper, we provide comprehensive analysis of noise influence in dynamic privacy schedules to answer these critical questions. We first present a dynamic noise schedule minimizing the utility upper bound of PGD, and show how the noise influence from each optimization step collectively impacts utility of the final model. Our study also reveals how impacts from dynamic noise influence change when momentum is used. 
We empirically show the connection exists for general non-convex losses, and the influence is greatly impacted by the loss curvature.", + "title": "On Dynamic Noise Influence in Differential Private Learning", + "authors": [ + "Junyuan Hong", + "Zhangyang Wang", + "Jiayu Zhou" + ], + "emails": [ + "~Junyuan_Hong1", + "~Zhangyang_Wang1", + "~Jiayu_Zhou1" + ], + "rank": 1325 + }, + { + "url": "https://openreview.net/forum?id=EQfpYwF3-b", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 4, + 7 + ], + "rating": "5.50", + "confidences": [ + 3, + 3, + 4 + ], + "abstract": "A common approach for compressing Natural Language Processing (NLP) networks is to encode the embedding layer as a matrix A\u2208Rn\u00d7d, compute its rank-j approximation Aj via SVD (Singular Value Decomposition), and then factor Aj into a pair of matrices that correspond to smaller fully-connected layers to replace the original embedding layer. Geometrically, the rows of A represent points in Rd, and the rows of Aj represent their projections onto the j-dimensional subspace that minimizes the sum of squared distances (``errors'') to the points. \nIn practice, these rows of A may be spread around k>1 subspaces, so factoring A based on a single subspace may lead to large errors that turn into large drops in accuracy.\n\nInspired by \\emph{projective clustering} from computational geometry, we suggest replacing this subspace by a set of k subspaces, each of dimension j, that minimizes the sum of squared distances over every point (row in A) to its \\emph{closest} subspace. Based on this approach, we provide a novel architecture that replaces the original embedding layer by a set of k small layers that operate in parallel and are then recombined with a single fully-connected layer. \n\nExtensive experimental results on the GLUE benchmark yield networks that are both more accurate and smaller compared to the standard matrix factorization (SVD). 
For example, we further compress DistilBERT by reducing the size of the embedding layer by 40% while incurring only a 0.5% average drop in accuracy over all nine GLUE tasks, compared to a 2.8% drop using the existing SVD approach.\nOn RoBERTa we achieve 43% compression of the embedding layer with less than a 0.8% average drop in accuracy as compared to a 3% drop previously.", + "title": "Deep Learning meets Projective Clustering", + "authors": [ + "Alaa Maalouf", + "Harry Lang", + "Daniela Rus", + "Dan Feldman" + ], + "emails": [ + "~Dan_Feldman1", + "~Harry_Lang1", + "~Alaa_Maalouf1", + "~Daniela_Rus1" + ], + "rank": 1326 + }, + { + "url": "https://openreview.net/forum?id=B9t708KMr9d", + "decision": "Reject", + "ratings": [ + 5, + 4, + 6, + 7 + ], + "rating": "5.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Graph neural network (GNN) and label propagation algorithm (LPA) are both message passing algorithms, which have achieved superior performance in semi-supervised classification. GNN performs \\emph{feature propagation} by a neural network to make predictions, while LPA uses \\emph{label propagation} across graph adjacency matrix to get results. However, there is still no good way to combine these two kinds of algorithms. In this paper, we proposed a new {\\bf Uni}fied {\\bf M}essage {\\bf P}assaging Model (UniMP) that can incorporate \\emph{feature propagation} and \\emph{label propagation} with a shared message passing network, providing a better performance in semi-supervised classification. First, we adopt a Graph Transformer jointly label embedding to propagate both the feature and label information. Second, to train UniMP without overfitting in self-loop label information, we propose a masked label prediction strategy, in which some percentage of training labels are simply masked at random, and then predicted. UniMP conceptually unifies feature propagation and label propagation and be empirically powerful. 
It obtains new state-of-the-art semi-supervised classification results in Open Graph Benchmark (OGB). ", + "title": "Masked Label Prediction: Unified Message Passing Model for Semi-Supervised Classification", + "authors": [ + "Yunsheng Shi", + "Zhengjie Huang", + "shikun feng", + "Hui Zhong", + "Wenjin Wang", + "Yu Sun" + ], + "emails": [ + "baidu.com", + "~shikun_feng1" + ], + "rank": 1327 + }, + { + "url": "https://openreview.net/forum?id=xrUySgB5ZOK", + "decision": "Reject", + "ratings": [ + 7, + 4, + 5 + ], + "rating": "5.50", + "confidences": [ + 5, + 4, + 3 + ], + "abstract": "Recent works have shown that features obtained from supervised training of CNNs may over-emphasize texture rather than encoding high-level information. In self-supervised learning, in particular, texture as a low-level cue may provide shortcuts that prevent the network from learning higher-level representations. To address these problems we propose to use classic methods based on anisotropic diffusion to augment training using images with suppressed texture. This simple method helps retain important edge information and suppress texture at the same time. \nWe report our observations for fully supervised and self-supervised learning tasks like MoCoV2 and Jigsaw and achieve state-of-the-art results on object detection and image classification with eight diverse datasets. \nOur method is particularly effective for transfer learning tasks and we observed improved performance on five standard transfer learning datasets. 
\nThe large improvements on the Sketch-ImageNet dataset, DTD dataset and\nadditional visual analyses of saliency maps suggest that our approach helps in learning better representations that transfer well.", + "title": "Learning Visual Representations for Transfer Learning by Suppressing Texture", + "authors": [ + "Shlok Kumar Mishra", + "Anshul Shah", + "Ankan Bansal", + "Jonghyun Choi", + "Abhinav Shrivastava", + "Abhishek Sharma", + "David Jacobs" + ], + "emails": [ + "~Abhinav_Shrivastava2", + "~Shlok_Kumar_Mishra1", + "~David_Jacobs1", + "~Anshul_Shah1", + "~Jonghyun_Choi1", + "~Abhishek_Sharma4", + "~Ankan_Bansal1" + ], + "rank": 1328 + }, + { + "url": "https://openreview.net/forum?id=CLYe1Yke1r", + "decision": "Reject", + "ratings": [ + 8, + 6, + 4, + 4 + ], + "rating": "5.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Learning representations of entities and relations in knowledge graphs is an active area of research, with much emphasis placed on choosing the appropriate geometry to capture tree-like structures. Box embeddings (Vilnis et al., 2018; Li et al., 2019; Dasgupta et al., 2020), which represent concepts as n-dimensional hyperrectangles, are capable of embedding trees by training on a subset of the transitive closure. In Patel et al. (2020), the authors demonstrate that only the transitive reduction is required, and further extend box embeddings to capture joint hierarchies by augmenting the graph with new nodes. While it is possible to represent joint hierarchies with this method, the parameters for each hierarchy are decoupled, making generalization between hierarchies infeasible. In this work, we introduce a learned box-to-box transformation which respects the geometric structure of the box embeddings. 
We demonstrate that this not only improves the capability of modeling cross-hierarchy compositional edges but is also capable of generalizing from a subset of the transitive reduction.", + "title": "Box-To-Box Transformation for Modeling Joint Hierarchies", + "authors": [ + "Shib Sankar Dasgupta", + "Xiang Li", + "Michael Boratko", + "Dongxu Zhang", + "Andrew McCallum" + ], + "emails": [ + "~Xiang_Li2", + "~Michael_Boratko1", + "~Shib_Sankar_Dasgupta2", + "~Andrew_McCallum1", + "~Dongxu_Zhang1" + ], + "rank": 1329 + }, + { + "url": "https://openreview.net/forum?id=OItp-Avs6Iy", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 6 + ], + "rating": "5.50", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Learning 3D representations that generalize well to arbitrarily oriented inputs is a challenge of practical importance in applications varying from computer vision to physics and chemistry.\nWe propose a novel multi-resolution convolutional architecture for learning over concentric spherical feature maps, of which the single sphere representation is a special case.\nOur hierarchical architecture is based on alternatively learning to incorporate both intra-sphere and inter-sphere information. \nWe show the applicability of our method for two different types of 3D inputs, mesh objects, which can be regularly sampled, and point clouds, which are irregularly distributed. 
\nWe also propose an efficient mapping of point clouds to concentric spherical images using radial basis functions, thereby bridging spherical convolutions on grids with general point clouds.\nWe demonstrate the effectiveness of our approach in achieving state-of-the-art performance on 3D classification tasks with rotated data.", + "title": "Concentric Spherical GNN for 3D Representation Learning", + "authors": [ + "James S Fox", + "Bo Zhao", + "Sivasankaran Rajamanickam", + "Rampi Ramprasad", + "Le Song" + ], + "emails": [ + "gatech.edu", + "~Le_Song1", + "mse.gatech", + "~James_S_Fox1", + "sandia.gov" + ], + "rank": 1330 + }, + { + "url": "https://openreview.net/forum?id=zg4GtrVQAKo", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6 + ], + "rating": "5.50", + "confidences": [ + 4, + 3, + 1 + ], + "abstract": "We consider the problem of information retrieval from a dataset of files stored on a single server under both a user distortion and a user privacy constraint. Specifically, a user requesting a file from the dataset should be able to reconstruct the requested file with a prescribed distortion, and in addition, the identity of the requested file should be kept private from the server with a prescribed privacy level. The proposed model can be seen as an extension of the well-known concept of private information retrieval by allowing for distortion in the retrieval process and relaxing the perfect privacy requirement. We initiate the study of the tradeoff between download rate, distortion, and user privacy leakage, and show that the optimal rate-distortion-leakage tradeoff is convex and that it allows for a concise information-theoretical formulation in terms of mutual information in the limit of large file sizes. Moreover, we propose a new data-driven framework by leveraging recent advancements in generative adversarial models which allows a user to learn efficient schemes in terms of download rate from the data itself. 
Learning the scheme is formulated as a constrained minimax game between a user which desires to keep the identity of the requested file private and an adversary that tries to infer which file the user is interested in under a distortion constraint. In general, guaranteeing a certain privacy level leads to a higher rate-distortion tradeoff curve, and hence a sacrifice in either download rate or distortion. We evaluate the performance of the scheme on a synthetic Gaussian dataset as well as on the MNIST dataset. For the MNIST dataset, the data-driven approach significantly outperforms a proposed general achievable scheme combining source coding with the download of multiple files.", + "title": "Generative Adversarial User Privacy in Lossy Single-Server Information Retrieval", + "authors": [ + "Chung-Wei Weng", + "Yauhen Yakimenka", + "Hsuan-Yin Lin", + "Eirik Rosnes", + "Joerg Kliewer" + ], + "emails": [ + "~Eirik_Rosnes1", + "~Joerg_Kliewer1", + "simula.no" + ], + "rank": 1331 + }, + { + "url": "https://openreview.net/forum?id=Mwuc0Plt_x2", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 5 + ], + "rating": "5.50", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "Flow-based generative models have become an important class of unsupervised learning approaches. In this work, we incorporate the key idea of renormalization group (RG) and sparse prior distribution to design a hierarchical flow-based generative model, called RG-Flow, which can separate different scale information of images with disentangle representations at each scale. We demonstrate our method mainly on the CelebA dataset and show that the disentangled representation at different scales enables semantic manipulation and style mixing of the images. To visualize the latent representation, we introduce the receptive fields for flow-based models and find receptive fields learned by RG-Flow are similar to convolutional neural networks. 
In addition, we replace the widely adopted Gaussian prior distribution by sparse prior distributions to further enhance the disentanglement of representations. From a theoretical perspective, the proposed method has O(log\u2061L) complexity for image inpainting compared to previous flow-based models with O(L2) complexity.", + "title": "RG-Flow: A hierarchical and explainable flow model based on renormalization group and sparse prior", + "authors": [ + "Hong-Ye Hu", + "Dian Wu", + "Yi-Zhuang You", + "Bruno Olshausen", + "Yubei Chen" + ], + "emails": [ + "~Yubei_Chen1", + "pku.edu.cn", + "physics.ucsd.edu", + "~Bruno_Olshausen1", + "~Hong-Ye_Hu1" + ], + "rank": 1332 + }, + { + "url": "https://openreview.net/forum?id=TTLwOwNkOfx", + "decision": "Reject", + "ratings": [ + 4, + 7, + 7, + 3 + ], + "rating": "5.50", + "confidences": [ + 3, + 5, + 3, + 3 + ], + "abstract": "There exists a need for unsupervised 3D segmentation on complex volumetric data, particularly when annotation ability is limited or discovery of new categories is desired. Using the observation that much of 3D volumetric data is innately hierarchical, we propose learning effective representations of 3D patches for unsupervised segmentation through a variational autoencoder (VAE) with a hyperbolic latent space and a proposed gyroplane convolutional layer, which better models the underlying hierarchical structure within a 3D image. We also introduce a hierarchical triplet loss and multi-scale patch sampling scheme to embed relationships across varying levels of granularity. 
We demonstrate the effectiveness of our hyperbolic representations for unsupervised 3D segmentation on a hierarchical toy dataset, BraTS whole tumor dataset, and cryogenic electron microscopy data.\n", + "title": "Learning Hyperbolic Representations for Unsupervised 3D Segmentation", + "authors": [ + "Joy Hsu", + "Jeffrey Gu", + "Gong Her Wu", + "Wah Chiu", + "Serena Yeung" + ], + "emails": [ + "~Serena_Yeung1", + "~Joy_Hsu2", + "stanford.edu", + "~Jeffrey_Gu1" + ], + "rank": 1333 + }, + { + "url": "https://openreview.net/forum?id=v2tmeZVV9-c", + "decision": "Reject", + "ratings": [ + 4, + 6, + 6, + 6 + ], + "rating": "5.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Iterative solvers are widely used to accurately simulate physical systems. These solvers require initial guesses to generate a sequence of improving approximate solutions. In this contribution, we introduce a novel method to accelerate iterative solvers for rod dynamics with graph networks (GNs) by predicting the initial guesses to reduce the number of iterations. Unlike existing methods that aim to learn physical systems in an end-to-end manner, our approach guarantees long-term stability and therefore leads to more accurate solutions. Furthermore, our method improves the run time performance of traditional iterative solvers for rod dynamics. To explore our method we make use of position-based dynamics (PBD) as a common solver for physical systems and evaluate it by simulating the dynamics of elastic rods. Our approach is able to generalize across different initial conditions, discretizations, and realistic material properties. We demonstrate that it also performs well when taking discontinuous effects into account such as collisions between individual rods. 
Finally, to illustrate the scalability of our approach, we simulate complex 3D tree models composed of over a thousand individual branch segments swaying in wind fields.", + "title": "Accurately Solving Rod Dynamics with Graph Learning", + "authors": [ + "Han Shao", + "Tassilo Kugelstadt", + "Torsten H\u00e4drich", + "Wojciech Pa\u0142ubicki", + "Jan Bender", + "Soren Pirk", + "Dominik Michels" + ], + "emails": [ + "~Han_Shao3", + "~Jan_Bender1", + "~Soren_Pirk2", + "cs.rwth-aachen.de", + "kaust.edu.sa", + "amu.edu.pl", + "~Dominik_Michels1" + ], + "rank": 1334 + }, + { + "url": "https://openreview.net/forum?id=yJHpncwG1B", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6, + 5 + ], + "rating": "5.50", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Federated learning (FL) learns a model jointly from a set of participating devices without sharing each other's privately held data. The characteristics of non-\\textit{i.i.d.} data across the network, low device participation, high communication costs, and the mandate that data remain private bring challenges in understanding the convergence of FL algorithms, particularly in regards to how convergence scales with the number of participating devices. In this paper, we focus on Federated Averaging (FedAvg)--arguably the most popular and effective FL algorithm class in use today--and provide a unified and comprehensive study of its convergence rate. Although FedAvg has recently been studied by an emerging line of literature, it remains open as to how FedAvg's convergence scales with the number of participating devices in the fully heterogeneous FL setting--a crucial question whose answer would shed light on the performance of FedAvg in large FL systems. We fill this gap by providing a unified analysis that establishes convergence guarantees for FedAvg under three classes of problems: strongly convex smooth, convex smooth, and overparameterized strongly convex smooth problems. 
We show that FedAvg enjoys linear speedup in each case, although with different convergence rates and communication efficiencies. While there have been linear speedup results from distributed optimization that assumes full participation, ours are the first to establish linear speedup for FedAvg under both statistical and system heterogeneity. For strongly convex and convex problems, we also characterize the corresponding convergence rates for the Nesterov accelerated FedAvg algorithm, which are the first linear speedup guarantees for momentum variants of FedAvg in the convex setting. To provably accelerate FedAvg, we design a new momentum-based FL algorithm that further improves the convergence rate in overparameterized linear regression problems. Empirical studies of the algorithms in various settings have supported our theoretical results.", + "title": "Federated Learning's Blessing: FedAvg has Linear Speedup", + "authors": [ + "Zhaonan Qu", + "Kaixiang Lin", + "Zhaojian Li", + "Jiayu Zhou", + "Zhengyuan Zhou" + ], + "emails": [ + "stanford.edu", + "~Zhengyuan_Zhou2", + "egr.msu.edu", + "~Jiayu_Zhou1", + "~Kaixiang_Lin1" + ], + "rank": 1335 + }, + { + "url": "https://openreview.net/forum?id=KubHAaKdSr7", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 5 + ], + "rating": "5.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Large Transformer models have achieved impressive performance in many natural language tasks. In particular, Transformer based language models have been shown to have great capabilities in encoding factual knowledge in their vast amount of parameters. While the tasks of improving the memorization and generalization of Transformers have been widely studied, it is not well known how to make transformers forget specific old facts and memorize new ones. 
In this paper, we propose a new task of explicitly modifying specific factual knowledge in Transformer models while ensuring the model performance does not degrade on the unmodified facts. This task is useful in many scenarios, such as updating stale knowledge, protecting privacy, and eliminating unintended biases stored in the models. We benchmarked several approaches that provide natural baseline performances on this task. This leads to the discovery of key components of a Transformer model that are especially effective for knowledge modifications. The work also provides insights into the role that different training phases (such as pretraining and fine-tuning) play towards memorization and knowledge modification.", + "title": "Modifying Memories in Transformer Models", + "authors": [ + "Chen Zhu", + "Ankit Singh Rawat", + "Manzil Zaheer", + "Srinadh Bhojanapalli", + "Daliang Li", + "Felix Yu", + "Sanjiv Kumar" + ], + "emails": [ + "~Ankit_Singh_Rawat1", + "~Manzil_Zaheer1", + "~Chen_Zhu2", + "~Srinadh_Bhojanapalli1", + "~Daliang_Li1", + "~Sanjiv_Kumar1", + "~Felix_Yu1" + ], + "rank": 1336 + }, + { + "url": "https://openreview.net/forum?id=iQxS0S9ir1a", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4, + 7 + ], + "rating": "5.50", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "We introduce a new notion of generalization--- Distributional Generalization--- which roughly states that outputs of a classifier at train and test time are close as distributions, as opposed to close in just their average error. For example, if we mislabel 30% of dogs as cats in the train set of CIFAR-10, then a ResNet trained to interpolation will in fact mislabel roughly 30% of dogs as cats on the test set as well, while leaving other classes unaffected. This behavior is not captured by classical generalization, which would only consider the average error and not the distribution of errors over the input domain. 
Our formal conjectures, which are much more general than this example, characterize the form of distributional generalization that can be expected in terms of problem parameters: model architecture, training procedure, number of samples, and data distribution. We give empirical evidence for these conjectures across a variety of domains in machine learning, including neural networks, kernel machines, and decision trees. Our results thus advance our understanding of interpolating classifiers.", + "title": "Distributional Generalization: A New Kind of Generalization", + "authors": [ + "Preetum Nakkiran", + "Yamini Bansal" + ], + "emails": [ + "~Yamini_Bansal1", + "~Preetum_Nakkiran1" + ], + "rank": 1337 + }, + { + "url": "https://openreview.net/forum?id=2LBhynkS2SC", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 6 + ], + "rating": "5.50", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Honey bees are a popular model for complex social systems, in which global behavior emerges from the actions and interactions of thousands of individuals. While the average life of a bee is organized as a sequence of tasks roughly determined by age, there is substantial variation at the individual level. For example, young bees can become foragers early in life, depending on the colony\u2019s needs. Using a unique dataset containing lifetime trajectories of all individuals over multiple generations in two honey bee colonies, we propose a new temporal matrix factorization model that jointly learns the average developmental path and structured variations of individuals in the social network over their entire lives. Our method yields inherently interpretable embeddings that are biologically plausible and consistent over time, which allow one to compare individuals regardless of when, or in which colony, they lived. 
Our method provides a quantitative framework for understanding behavioral heterogeneity in complex social systems applicable in fields such as behavioral biology, social sciences, neuroscience, and information science.", + "title": "Individuality in the hive - Learning to embed lifetime social behaviour of honey bees", + "authors": [ + "Benjamin Wild", + "David Dormagen", + "Michael L Smith", + "Tim Landgraf" + ], + "emails": [ + "ab.mpg.de", + "~Tim_Landgraf1", + "~Benjamin_Wild1", + "fu-berlin.de" + ], + "rank": 1338 + }, + { + "url": "https://openreview.net/forum?id=le9LIliDOG", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 6 + ], + "rating": "5.47", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "The efficient treatment of long-range interactions for point clouds is a challenging problem in many scientific machine learning applications. To extract global information, one usually needs a large window size, a large number of layers, and/or a large number of channels. This can often significantly increase the computational cost. In this work, we present a novel neural network layer that directly incorporates long-range information for a point cloud. This layer, dubbed the long-range convolutional (LRC)-layer, leverages the convolutional theorem coupled with the non-uniform Fourier transform. In a nutshell, the LRC-layer mollifies the point cloud to an adequately sized regular grid, computes its Fourier transform, multiplies the result by a set of trainable Fourier multipliers, computes the inverse Fourier transform, and finally interpolates the result back to the point cloud. The resulting global all-to-all convolution operation can be performed in nearly-linear time asymptotically with respect to the number of input points. The LRC-layer is a particularly powerful tool when combined with local convolution as together they offer efficient and seamless treatment of both short and long range interactions. 
We showcase this framework by introducing a neural network architecture that combines LRC-layers with short-range convolutional layers to accurately learn the energy and force associated with a N-body potential. We also exploit the induced two-level decomposition and propose an efficient strategy to train the combined architecture with a reduced number of samples.", + "title": "Efficient Long-Range Convolutions for Point Clouds", + "authors": [ + "Yifan Peng", + "Lin Lin", + "Lexing Ying", + "Leonardo Zepeda-Nunez" + ], + "emails": [ + "~Lin_Lin1", + "~Lexing_Ying1", + "sjtu.edu.cn", + "~Leonardo_Zepeda-Nunez1" + ], + "rank": 1339 + }, + { + "url": "https://openreview.net/forum?id=Cue2ZEBf12", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6, + 5 + ], + "rating": "5.47", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Recent works have applied Bayesian Neural Network (BNN) to adversarial training, and shown the improvement of adversarial robustness via the BNN's strength of stochastic gradient defense. However, we have found that in general, the BNN loses its stochasticity after its training with the BNN's posterior. As a result, the lack of the stochasticity leads to weak regularization effect to the BNN, which increases KL divergence in ELBO from variational inference. In this paper, we propose an enhanced Bayesian regularizer through hierarchical variational inference in order to boost adversarial robustness against gradient-based attack. Furthermore, we also prove that the proposed method allows the BNN's stochasticity to be elevated with the reduced KL divergence. 
Exhaustive experiment results demonstrate the effectiveness of the proposed method by showing the improvement of adversarial robustness, compared with adversarial training (Madry et al., 2018) and adversarial-BNN (Liu et al., 2019) under PGD attack and EOT-PGD attack to the L\u221e perturbation on CIFAR-10/100, STL-10, and Tiny-ImageNet.", + "title": "Towards Adversarial Robustness of Bayesian Neural Network through Hierarchical Variational Inference", + "authors": [ + "Byung-Kwan Lee", + "Youngjoon Yu", + "Yong Man Ro" + ], + "emails": [ + "~Byung-Kwan_Lee1", + "~Yong_Man_Ro1", + "~Youngjoon_Yu1" + ], + "rank": 1340 + }, + { + "url": "https://openreview.net/forum?id=GH7QRzUDdXG", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 6, + 5 + ], + "rating": "5.47", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Generative adversarial networks (GANs) have emerged as a powerful unsupervised method to model the statistical patterns of real-world data sets, such as natural images. These networks are trained to map random inputs in their latent space to new samples representative of the learned data. However, the structure of the latent space is hard to intuit due to its high dimensionality and the non-linearity of the generator, which limits the usefulness of the models. Understanding the latent space requires a way to identify input codes for existing real-world images (inversion), and a way to identify directions with known image transformations (interpretability). Here, we use a geometric framework to address both issues simultaneously. We develop an architecture-agnostic method to compute the Riemannian metric of the image manifold created by GANs. The eigen-decomposition of the metric isolates axes that account for different levels of image variability. 
An empirical analysis of several pretrained GANs shows that image variation around each position is concentrated along surprisingly few major axes (the space is highly anisotropic) and the directions that create this large variation are similar at different positions in the space (the space is homogeneous). We show that many of the top eigenvectors correspond to interpretable transforms in the image space, with a substantial part of eigenspace corresponding to minor transforms which could be compressed out. This geometric understanding unifies key previous results related to GAN interpretability. We show that the use of this metric allows for more efficient optimization in the latent space (e.g. GAN inversion) and facilitates unsupervised discovery of interpretable axes. Our results illustrate that defining the geometry of the GAN image manifold can serve as a general framework for understanding GANs. ", + "title": "A Geometric Analysis of Deep Generative Image Models and Its Applications", + "authors": [ + "Binxu Wang", + "Carlos R Ponce" + ], + "emails": [ + "~Binxu_Wang1", + "~Carlos_R_Ponce1" + ], + "rank": 1341 + }, + { + "url": "https://openreview.net/forum?id=uFBBOJ7xnu", + "decision": "Reject", + "ratings": [ + 6, + 6, + 4, + 6 + ], + "rating": "5.47", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Events in the real world are correlated across nearby points in time, and we must learn from this temporally \u201csmooth\u201d data. However, when neural networks are trained to categorize or reconstruct single items, the common practice is to randomize the order of training items. What are the effects of temporally smooth training data on the efficiency of learning? We first tested the effects of smoothness in training data on incremental learning in feedforward nets and found that smoother data slowed learning. Moreover, sampling so as to minimize temporal smoothness produced more efficient learning than sampling randomly. 
If smoothness generally impairs incremental learning, then how can networks be modified to benefit from smoothness in the training data? We hypothesized that two simple brain-inspired mechanisms -- leaky memory in activation units and memory-gating -- could enable networks to exploit the redundancies in smooth data. Across all levels of data smoothness, these brain-inspired architectures achieved more efficient category learning than feedforward networks. Finally, we investigated how these brain-inspired mechanisms altered the internal representations learned by the networks. We found that networks with multi-scale leaky memory and memory-gating could learn internal representations that \u201cun-mixed\u201d data sources which vary on fast and slow timescales across training samples. Altogether, we identified simple mechanisms enabling neural networks to learn more quickly from temporally smooth data, and to generate internal representations that separate timescales in the training signal.", + "title": "Learning representations from temporally smooth data", + "authors": [ + "Shima Rahimi Moghaddam", + "Fanjun Bu", + "Christopher Honey" + ], + "emails": [ + "~Shima_Rahimi_Moghaddam1", + "~Christopher_Honey1", + "jhu.edu" + ], + "rank": 1342 + }, + { + "url": "https://openreview.net/forum?id=8QAXsAOSBjE", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 5 + ], + "rating": "5.47", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Conversational analysis systems are trained using noisy human labels and often require heavy preprocessing during multi-modal feature extraction. Using noisy labels in single-task learning increases the risk of over-fitting. However, auxiliary tasks could improve the performance of the primary task learning. This approach is known as Primary Multi-Task Learning (MTL). A challenge of MTL is the selection of beneficial auxiliary tasks that avoid negative transfer. 
In this paper, we explore how the preprocessed data used for feature engineering can be re-used as auxiliary tasks in Primary MTL, thereby promoting the productive use of data in the form of auxiliary supervision learning. Our main contributions are: (1) the identification of sixteen beneficially auxiliary tasks, (2) the method of distributing learning capacity between the primary and auxiliary tasks, and (3) the relative supervision hierarchy between the primary and auxiliary tasks. Extensive experiments on IEMOCAP and SEMAINE data validate the improvements over single-task approaches, and suggest that it may generalize across multiple primary tasks.", + "title": "Reusing Preprocessing Data as Auxiliary Supervision in Conversational Analysis", + "authors": [ + "Joshua Yee Kim", + "Kalina Yacef" + ], + "emails": [ + "sydney.edu.au", + "~Joshua_Yee_Kim1" + ], + "rank": 1343 + }, + { + "url": "https://openreview.net/forum?id=4c3WeBTErrE", + "decision": "Reject", + "ratings": [ + 5, + 7, + 5, + 5 + ], + "rating": "5.47", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Recurrent neural networks (RNNs) can learn complex, long-range structure in time series data simply by predicting one point at a time. Because of this ability, they have enjoyed widespread adoption in commercial and academic contexts. Yet RNNs have a fundamental limitation: they represent time as a series of discrete, uniform time steps. As a result, they force a tradeoff between temporal resolution and the computational expense of predicting far into the future. To resolve this tension, we propose a Jumpy RNN model which does not predict state transitions over uniform intervals of time. Instead, it predicts a sequence of linear dynamics functions in latent space and intervals of time over which their predictions can be expected to be accurate. 
This structure enables our model to jump over long time intervals while retaining the ability to produce fine-grained or continuous-time predictions when necessary. In simple physics simulations, our model can skip over long spans of predictable motion and focus on key events such as collisions between two balls. On a set of physics tasks including coordinate and pixel observations of a small-scale billiards environment, our model matches the performance of a baseline RNN while using a fifth of the compute. On a real-world weather forecasting dataset, it makes more accurate predictions while using fewer sampling steps. When used for model-based planning, our method matches a baseline RNN while using half the compute.", + "title": "Jumpy Recurrent Neural Networks", + "authors": [ + "Samuel James Greydanus", + "Stefan Lee", + "Alan Fern" + ], + "emails": [ + "~Stefan_Lee1", + "~Alan_Fern1", + "~Samuel_James_Greydanus1" + ], + "rank": 1344 + }, + { + "url": "https://openreview.net/forum?id=C4-QQ1EHNcI", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 5 + ], + "rating": "5.47", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "The Bayesian paradigm has the potential to solve some of the core issues in modern deep learning, such as poor calibration, data inefficiency, and catastrophic forgetting. However, scaling Bayesian inference to the high-dimensional parameter spaces of deep neural networks requires restrictive approximations. In this paper, we propose performing inference over only a small subset of the model parameters while keeping all others as point estimates. This enables us to use expressive posterior approximations that would otherwise be intractable for the full model. In particular, we develop a practical and scalable Bayesian deep learning method that first trains a point estimate, and then infers a full covariance Gaussian posterior approximation over a subnetwork. 
We propose a subnetwork selection procedure which aims to maximally preserve posterior uncertainty. We empirically demonstrate the effectiveness of our approach compared to point-estimated networks and methods that use less expressive posterior approximations over the full network.", + "title": "Expressive yet Tractable Bayesian Deep Learning via Subnetwork Inference", + "authors": [ + "Erik Daxberger", + "Eric Nalisnick", + "James Allingham", + "Javier Antoran", + "Jos\u00e9 Miguel Hern\u00e1ndez-Lobato" + ], + "emails": [ + "cam.ac.uk", + "~Jos\u00e9_Miguel_Hern\u00e1ndez-Lobato1", + "~Erik_Daxberger1", + "~Javier_Antoran1", + "~Eric_Nalisnick1" + ], + "rank": 1345 + }, + { + "url": "https://openreview.net/forum?id=ZvvxYyjfvZc", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 4 + ], + "rating": "5.47", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "A common optimization tool used in deep reinforcement learning is momentum, which consists in accumulating and discounting past gradients, reapplying them at each iteration. We argue that, unlike in supervised learning, momentum in Temporal Difference (TD) learning accumulates gradients that become doubly stale: not only does the gradient of the loss change due to parameter updates, the loss itself changes due to bootstrapping. We first show that this phenomenon exists, and then propose a first-order correction term to momentum. We show that this correction term improves sample efficiency in policy evaluation by correcting target value drift. 
An important insight of this work is that deep RL methods are not always best served by directly importing techniques from the supervised setting.", + "title": "Correcting Momentum in Temporal Difference Learning", + "authors": [ + "Emmanuel Bengio", + "Joelle Pineau", + "Doina Precup" + ], + "emails": [ + "~Emmanuel_Bengio1", + "~Joelle_Pineau1", + "~Doina_Precup1" + ], + "rank": 1346 + }, + { + "url": "https://openreview.net/forum?id=Bpw_O132lWT", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 5 + ], + "rating": "5.47", + "confidences": [ + 3, + 3, + 4, + 5 + ], + "abstract": "Stochastic gradient descent (SGD) and its variants are mainstream methods to train deep neural networks. Since neural networks are non-convex, more and more works study the dynamic behavior of SGD and its impact to generalization, especially the escaping efficiency from local minima. However, these works make the over-simplified assumption that the distribution of gradient noise is state-independent, although it is state-dependent. In this work, we propose a novel power-law dynamic with state-dependent diffusion to approximate the dynamic of SGD. Then, we prove that the stationary distribution of power-law dynamic is heavy-tailed, which matches the existing empirical observations. Next, we study the escaping efficiency from local minimum of power-law dynamic and prove that the mean escaping time is in polynomial order of the barrier height of the basin, much faster than exponential order of previous dynamics. It indicates that SGD can escape deep sharp minima efficiently and tends to stop at flat minima that have lower generalization error. 
Finally, we conduct experiments to compare SGD and power-law dynamic, and the results verify our theoretical findings.", + "title": "Dynamic of Stochastic Gradient Descent with State-dependent Noise", + "authors": [ + "Qi Meng", + "Shiqi Gong", + "Wei Chen", + "Zhi-Ming Ma", + "Tie-Yan Liu" + ], + "emails": [ + "~Qi_Meng1", + "~Shiqi_Gong1", + "~Tie-Yan_Liu1", + "~Zhi-Ming_Ma1", + "~Wei_Chen1" + ], + "rank": 1347 + }, + { + "url": "https://openreview.net/forum?id=Z532uNJyG5y", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 6 + ], + "rating": "5.47", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "How to discriminatively vectorize graphs is a fundamental challenge that attracts increasing attentions in recent years. Motivated by the recent success of unsupervised contrastive learning, we aim to learn graph-level representation in an unsupervised manner. Specifically, we propose a novel unsupervised graph learning paradigm called Iterative Graph Self-Distillation (IGSD) which iteratively performs the teacher-student distillation with graph augmentations. Different from conventional knowledge distillation, IGSD constructs the teacher with an exponential moving average of the student model and distills the knowledge of itself. The intuition behind IGSD is to predict the teacher network representation of the graph pairs under different augmented views. As a natural extension, we also apply IGSD to semi-supervised scenarios by jointly regularizing the network with both supervised and unsupervised contrastive loss. Finally, we show that finetuning the IGSD-trained models with self-training can further improve the graph representation power. 
Empirically, we achieve significant and consistent performance gain on various graph datasets in both unsupervised and semi-supervised settings, which well validates the superiority of IGSD.", + "title": "Iterative Graph Self-Distillation", + "authors": [ + "Hanlin Zhang", + "Shuai Lin", + "Weiyang Liu", + "Pan Zhou", + "Jian Tang", + "Xiaodan Liang", + "Eric Xing" + ], + "emails": [ + "~Pan_Zhou3", + "~Eric_Xing1", + "~Hanlin_Zhang1", + "~Weiyang_Liu1", + "~Jian_Tang1", + "~Xiaodan_Liang2", + "~Shuai_Lin1" + ], + "rank": 1348 + }, + { + "url": "https://openreview.net/forum?id=PYAFKBc8GL4", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 4 + ], + "rating": "5.47", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Federated learning is a distributed optimization paradigm that enables a large number of resource-limited client nodes to cooperatively train a model without data sharing. Several works have analyzed the convergence of federated learning by accounting of data heterogeneity, communication and computation limitations, and partial client participation. However, they assume unbiased client participation, where clients are selected at random or in proportion of their data sizes. In this paper, we present the first convergence analysis of federated optimization for biased client selection strategies, and quantify how the selection bias affects convergence speed. We reveal that biasing client selection towards clients with higher local loss achieves faster error convergence. Using this insight, we propose Power-of-Choice, a communication- and computation-efficient client selection framework that can flexibly span the trade-off between convergence speed and solution bias. We also propose an extension of Power-of-Choice that is able to maintain convergence speed improvement while diminishing the selection skew. 
Our experiments demonstrate that Power-of-Choice strategies converge up to 3 \u00d7 faster and give 10% higher test accuracy than the baseline random selection. ", + "title": "Client Selection in Federated Learning: Convergence Analysis and Power-of-Choice Selection Strategies", + "authors": [ + "Yae Jee Cho", + "Jianyu Wang", + "Gauri Joshi" + ], + "emails": [ + "~Yae_Jee_Cho1", + "~Gauri_Joshi1", + "~Jianyu_Wang2" + ], + "rank": 1349 + }, + { + "url": "https://openreview.net/forum?id=punMXQEsPr0", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 6 + ], + "rating": "5.47", + "confidences": [ + 4, + 5, + 3, + 3 + ], + "abstract": "Understanding document from their visual snapshots is an emerging and challenging problem that requires both advanced computer vision and NLP methods. Although the recent advance in OCR enables the accurate extraction of text segments, it is still challenging to extract key information from documents due to the diversity of layouts. To compensate for the difficulties, this paper introduces a pre-trained language model, BERT Relying On Spatiality (BROS), that represents and understands the semantics of spatially distributed texts. Different from previous pre-training methods on 1D text, BROS is pre-trained on large-scale semi-structured documents with a novel area-masking strategy while efficiently including the spatial layout information of input documents. Also, to generate structured outputs in various document understanding tasks, BROS utilizes a powerful graph-based decoder that can capture the relation between text segments. BROS achieves state-of-the-art results on four benchmark tasks: FUNSD, SROIE*, CORD, and SciTSR. 
Our experimental settings and implementation codes will be publicly available.", + "title": "BROS: A Pre-trained Language Model for Understanding Texts in Document", + "authors": [ + "Teakgyu Hong", + "DongHyun Kim", + "Mingi Ji", + "Wonseok Hwang", + "Daehyun Nam", + "Sungrae Park" + ], + "emails": [ + "~Teakgyu_Hong1", + "~Mingi_Ji1", + "~DongHyun_Kim3", + "~Daehyun_Nam2", + "~Sungrae_Park1", + "~Wonseok_Hwang1" + ], + "rank": 1350 + }, + { + "url": "https://openreview.net/forum?id=10XWPuAro86", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 6 + ], + "rating": "5.47", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Model-free reinforcement learning (RL), in particular Q-learning is widely used to learn optimal policies for a variety of planning and control problems. However, when the underlying state-transition dynamics are stochastic and high-dimensional, Q-learning requires a large amount of data and incurs a prohibitively high computational cost. In this paper, we introduce Hamiltonian Q-Learning, a data efficient modification of the Q-learning approach, which adopts an importance-sampling based technique for computing the Q function. To exploit stochastic structure of the state-transition dynamics, we employ Hamiltonian Monte Carlo to update Q function estimates by approximating the expected future rewards using Q values associated with a subset of next states. Further, to exploit the latent low-rank structure of the dynamic system, Hamiltonian Q-Learning uses a matrix completion algorithm to reconstruct the updated Q function from Q value updates over a much smaller subset of state-action pairs. 
By providing an efficient way to apply Q-learning in stochastic, high-dimensional problems, the proposed approach broadens the scope of RL algorithms for real-world applications, including classical control tasks and environmental monitoring.", + "title": "Hamiltonian Q-Learning: Leveraging Importance-sampling for Data Efficient RL", + "authors": [ + "Udari Madhushani", + "Biswadip Dey", + "Naomi Leonard", + "Amit Chakraborty" + ], + "emails": [ + "~Amit_Chakraborty2", + "~Naomi_Leonard1", + "~Biswadip_Dey2", + "~Udari_Madhushani1" + ], + "rank": 1351 + }, + { + "url": "https://openreview.net/forum?id=Kkw3shxszSd", + "decision": "Reject", + "ratings": [ + 9, + 3, + 4, + 6 + ], + "rating": "5.47", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "While protein sequence data is an emerging application domain for machine learning methods, small modifications to protein sequences can result in difficult-to-predict changes to the protein's function. Consequently, protein machine learning models typically do not use randomized data augmentation procedures analogous to those used in computer vision or natural language, e.g., cropping or synonym substitution. In this paper, we empirically explore a set of simple string manipulations, which we use to augment protein sequence data when fine-tuning semi-supervised protein models. We provide 276 different comparisons to the Tasks Assessing Protein Embeddings (TAPE) baseline models, with Transformer-based models and training datasets that vary from the baseline methods only in the data augmentations and representation learning procedure. For each TAPE validation task, we demonstrate improvements to the baseline scores when the learned protein representation is fixed between tasks. We also show that contrastive learning fine-tuning methods typically outperform masked-token prediction in these models, with increasing amounts of data augmentation generally improving performance for contrastive learning protein methods. 
We find the most consistent results across TAPE tasks when using domain-motivated transformations, such as amino acid replacement, as well as restricting the Transformer attention to randomly sampled sub-regions of the protein sequence. In rarer cases, we even find that information-destroying augmentations, such as randomly shuffling entire protein sequences, can improve downstream performance. ", + "title": "Improving Generalizability of Protein Sequence Models via Data Augmentations", + "authors": [ + "Hongyu Shen", + "Layne C. Price", + "Mohammad Taha Bahadori", + "Franziska Seeger" + ], + "emails": [ + "amazon.com", + "~Mohammad_Taha_Bahadori1" + ], + "rank": 1352 + }, + { + "url": "https://openreview.net/forum?id=avBunqDXFS", + "decision": "Reject", + "ratings": [ + 5, + 6, + 7, + 4 + ], + "rating": "5.47", + "confidences": [ + 3, + 5, + 4, + 5 + ], + "abstract": "Rehearsal is a critical component for class-incremental continual learning, yet it requires a substantial memory budget. Our work investigates whether we can significantly reduce this memory budget by leveraging unlabeled data from an agent's environment in a realistic and challenging continual learning paradigm. Specifically, we explore and formalize a novel semi-supervised continual learning (SSCL) setting, where labeled data is scarce yet non-i.i.d. unlabeled data from the agent's environment is plentiful. Importantly, data distributions in the SSCL setting are realistic and therefore reflect object class correlations between, and among, the labeled and unlabeled data distributions. We show that a strategy built on pseudo-labeling, consistency regularization, Out-of-Distribution (OoD) detection, and knowledge distillation reduces forgetting in this setting. Our approach, DistillMatch, increases performance over the state-of-the-art by no less than 8.7% average task accuracy and up to a 54.5% increase in average task accuracy in SSCL CIFAR-100 experiments. 
Moreover, we demonstrate that DistillMatch can save up to 0.23 stored images per processed unlabeled image compared to the next best method which only saves 0.08. Our results suggest that focusing on realistic correlated distributions is a significantly new perspective, which accentuates the importance of leveraging the world's structure as a continual learning strategy.", + "title": "Memory-Efficient Semi-Supervised Continual Learning: The World is its Own Replay Buffer", + "authors": [ + "James Smith", + "Jonathan C Balloch", + "Yen-Chang Hsu", + "Zsolt Kira" + ], + "emails": [ + "~Yen-Chang_Hsu1", + "~Zsolt_Kira1", + "~James_Smith1", + "~Jonathan_C_Balloch1" + ], + "rank": 1353 + }, + { + "url": "https://openreview.net/forum?id=2LiGI26kRdt", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 6 + ], + "rating": "5.47", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Pre-trained language models, such as BERT, have achieved significant accuracy gain in many natural language processing tasks. Despite its effectiveness, the huge number of parameters makes training a BERT model computationally very challenging. In this paper, we propose an efficient multi-stage layerwise training (MSLT) approach to reduce the training time of BERT. We decompose the whole training process into several stages. The training is started from a small model with only a few encoder layers and we gradually increase the depth of the model by adding new encoder layers. At each stage, we only train the top (near the output layer) few encoder layers which are newly added. The parameters of the other layers which have been trained in the previous stages will not be updated in the current stage. In BERT training, the backward calculation is much more time-consuming than the forward calculation, especially in the distributed training setting in which the backward calculation time further includes the communication time for gradient synchronization. 
In the proposed training strategy, only top few layers participate backward calculation, while most layers only participate forward calculation. Hence both the computation and communication efficiencies are greatly improved. Experimental results show that the proposed method can greatly reduce the training time without significant performance degradation.", + "title": "Progressively Stacking 2.0: A Multi-stage Layerwise Training Method for BERT Training Speedup", + "authors": [ + "Cheng Yang", + "Shengnan Wang", + "Chao Yang", + "Yuechuan Li", + "Ru He", + "Jingqiao Zhang" + ], + "emails": [ + "~Yuechuan_Li1", + "~Ru_He1", + "~Jingqiao_Zhang1", + "~Shengnan_Wang2", + "~Cheng_Yang3", + "~Chao_Yang7" + ], + "rank": 1354 + }, + { + "url": "https://openreview.net/forum?id=IPGZ6S3LDdw", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 5 + ], + "rating": "5.47", + "confidences": [ + 4, + 3, + 3, + 5 + ], + "abstract": "Sampling-based neural architecture search (NAS) always guarantees better convergence yet suffers from huge computational resources compared with gradient-based approaches, due to the rollout bottleneck -- exhaustive training for each sampled generation on proxy tasks. This work provides a general pipeline to accelerate the convergence of the rollout process as well as the RL learning process in sampling-based NAS. It is motivated by the interesting observation that both the architecture and the parameter knowledge can be transferred between different experiments and even different tasks. We first introduce an uncertainty-aware critic (value function) in PPO to utilize the architecture knowledge in previous experiments, which stabilizes the training process and reduces the searching time by 4 times. Further, a life-long knowledge pool together with a block similarity function is proposed to utilize the lifelong parameter knowledge and reduces the searching time by 2 times. It is the first to introduce block-level weight sharing in RL-based NAS. 
The block similarity function guarantees a 100% hitting ratio with strict fairness. Besides, we show a simply designed off-policy correction factor that enables 'replay buffer' in RL optimization and further reduces half of the searching time. Experiments on the MNAS search space show the proposed FNAS accelerates standard RL-based NAS process by \u223c10x (e.g. \u223c256 2x2 TPUv2*days / 20,000 GPU*hour \u2192 2,000 GPU*hour for MNAS), and guarantees better performance on various vision tasks.", + "title": "Fast MNAS: Uncertainty-aware Neural Architecture Search with Lifelong Learning", + "authors": [ + "Jihao Liu", + "Yangting Sun", + "Ming Zhang", + "Boxiao Liu", + "Yu Liu" + ], + "emails": [ + "~Yu_Liu2", + "~Yangting_Sun1", + "~Jihao_Liu3", + "~Ming_Zhang10", + "~Boxiao_Liu1" + ], + "rank": 1355 + }, + { + "url": "https://openreview.net/forum?id=pTZ6EgZtzDU", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 6 + ], + "rating": "5.47", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Meta-reinforcement learning aims at finding a policy able to generalize to new environments. When facing a new environment, this policy must explore to identify its particular characteristics and then exploit this information for collecting reward. We consider the online adaptation setting where the agent needs to trade-off between the two types of behaviour within the same episode. Even though policies based on recurrent neural networks can be used in this setting by training them on multiple environments, they often fail to model this trade-off, or solve it at a very high computational cost. In this paper, we propose a new algorithm that uses privileged information in the form of a task descriptor at train time to improve the learning of recurrent policies. 
Our method learns an informed policy (i.e., a policy receiving as input the description of the current task) that is used to both construct task embeddings from the descriptors, and to regularize the training of the recurrent policy through parameters sharing and an auxiliary objective. This approach significantly reduces the learning sample complexity without altering the representational power of RNNs, by focusing on the relevant characteristics of the task, and by exploiting them efficiently. We evaluate our algorithm in a variety of environments that require sophisticated exploration/exploitation strategies and show that it outperforms vanilla RNNs, Thompson sampling and the task-inference approaches to meta-reinforcement learning.", + "title": "Meta-Reinforcement Learning With Informed Policy Regularization", + "authors": [ + "Pierre-Alexandre Kamienny", + "Matteo Pirotta", + "Alessandro Lazaric", + "Thibault Lavril", + "Nicolas Usunier", + "Ludovic Denoyer" + ], + "emails": [ + "~Nicolas_Usunier1", + "~Ludovic_Denoyer1", + "~Alessandro_Lazaric2", + "~Matteo_Pirotta1", + "~Pierre-Alexandre_Kamienny1", + "fb.com" + ], + "rank": 1356 + }, + { + "url": "https://openreview.net/forum?id=w6p7UMtf-0S", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 5 + ], + "rating": "5.46", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "We propose a transductive meta-learning method that uses unlabelled instances to improve few-shot image classification performance. Our approach combines a regularized Mahalanobis-distance-based soft k-means clustering procedure with a modified state of the art neural adaptive feature extractor to achieve improved test-time classification accuracy using unlabelled data. We evaluate our method on transductive few-shot learning tasks, in which the goal is to jointly predict labels for query (test) examples given a set of support (training) examples. 
We achieve new state of the art performance on the Meta-Dataset and the mini-ImageNet and tiered-ImageNet benchmarks.", + "title": "Improving Few-Shot Visual Classification with Unlabelled Examples", + "authors": [ + "Peyman Bateni", + "Jarred Barber", + "Jan-Willem van de Meent", + "Frank Wood" + ], + "emails": [ + "~Jan-Willem_van_de_Meent1", + "gmail.com", + "~Frank_Wood2", + "~Peyman_Bateni1" + ], + "rank": 1357 + }, + { + "url": "https://openreview.net/forum?id=kGvXK_1qzyy", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 5 + ], + "rating": "5.46", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Detection of deterioration of agent performance in dynamic environments is challenging due to the non-i.i.d nature of the observed performance. We consider an episodic framework, where the objective is to detect when an agent begins to falter. We devise a hypothesis testing procedure for non-i.i.d rewards, which is optimal under certain conditions. To apply the procedure sequentially in an online manner, we also suggest a novel Bootstrap mechanism for False Alarm Rate control (BFAR). We demonstrate our procedure in problems where the rewards are not independent, nor identically-distributed, nor normally-distributed. The statistical power of the new testing procedure is shown to outperform alternative tests - often by orders of magnitude - for a variety of environment modifications (which cause deterioration in agent performance). Our detection method is entirely external to the agent, and in particular does not require model-based learning. 
Furthermore, it can be applied to detect changes or drifts in any episodic signal.", + "title": "Drift Detection in Episodic Data: Detect When Your Agent Starts Faltering", + "authors": [ + "Ido Greenberg", + "Shie Mannor" + ], + "emails": [ + "~Ido_Greenberg1", + "~Shie_Mannor2" + ], + "rank": 1358 + }, + { + "url": "https://openreview.net/forum?id=C3qvk5IQIJY", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 6, + 4 + ], + "rating": "5.46", + "confidences": [ + 3, + 3, + 2, + 5 + ], + "abstract": "A broad class of unsupervised deep learning methods such as Generative Adversarial Networks (GANs) involve training of overparameterized models where the number of parameters of the model exceeds a certain threshold. Indeed, most successful GANs used in practice are trained using overparameterized generator and discriminator networks, both in terms of depth and width. A large body of work in supervised learning have shown the importance of model overparameterization in the convergence of the gradient descent (GD) to globally optimal solutions. In contrast, the unsupervised setting and GANs in particular involve non-convex concave mini-max optimization problems that are often trained using Gradient Descent/Ascent (GDA).\nThe role and benefits of model overparameterization in the convergence of GDA to a global saddle point in non-convex concave problems is far less understood. In this work, we present a comprehensive analysis of the importance of model overparameterization in GANs both theoretically and empirically. We theoretically show that in an overparameterized GAN model with a 1-layer neural network generator and a linear discriminator, GDA converges to a global saddle point of the underlying non-convex concave min-max problem. To the best of our knowledge, this is the first result for global convergence of GDA in such settings. 
Our theory is based on a more general result that holds for a broader class of nonlinear generators and discriminators that obey certain assumptions (including deeper generators and random feature discriminators). Our theory utilizes and builds upon a novel connection with the convergence analysis of linear time-varying dynamical systems which may have broader implications for understanding the convergence behavior of GDA for non-convex concave problems involving overparameterized models. We also empirically study the role of model overparameterization in GANs using several large-scale experiments on CIFAR-10 and Celeb-A datasets. Our experiments show that overparameterization improves the quality of generated samples across various model architectures and datasets. Remarkably, we observe that overparameterization leads to faster and more stable convergence behavior of GDA across the board.", + "title": "Understanding Over-parameterization in Generative Adversarial Networks", + "authors": [ + "Yogesh Balaji", + "Mohammadmahdi Sajedi", + "Neha Mukund Kalibhat", + "Mucong Ding", + "Dominik St\u00f6ger", + "Mahdi Soltanolkotabi", + "Soheil Feizi" + ], + "emails": [ + "~Dominik_St\u00f6ger1", + "~Mucong_Ding1", + "~Yogesh_Balaji1", + "~Neha_Mukund_Kalibhat1", + "~Soheil_Feizi2", + "usc.edu", + "~Mahdi_Soltanolkotabi1" + ], + "rank": 1359 + }, + { + "url": "https://openreview.net/forum?id=CMsvjAnW1zE", + "decision": "Reject", + "ratings": [ + 6, + 5, + 7, + 4 + ], + "rating": "5.46", + "confidences": [ + 2, + 3, + 4, + 4 + ], + "abstract": "In this work, we comprehensively reveal the learning dynamics of neural network with normalization, weight decay (WD), and SGD (with momentum), named as Spherical Motion Dynamics (SMD). Most related works study SMD by focusing on \"effective learning rate\" in \"equilibrium\" condition, where weight norm remains unchanged. 
However, their discussions on why equilibrium condition can be reached in SMD is either absent or less convincing. Our work investigates SMD by directly exploring the cause of equilibrium condition. Specifically, 1) we introduce the assumptions that can lead to equilibrium condition in SMD, and prove that weight norm can converge at linear rate with given assumptions; 2) we propose \"angular update\" as a substitute for effective learning rate to measure the evolving of neural network in SMD, and prove angular update can also converge to its theoretical value at linear rate; 3) we verify our assumptions and theoretical results on various computer vision tasks including ImageNet and MSCOCO with standard settings. Experiment results show our theoretical findings agree well with empirical observations.", + "title": "Spherical Motion Dynamics: Learning Dynamics of Neural Network with Normalization, Weight Decay, and SGD", + "authors": [ + "Ruosi Wan", + "Zhanxing Zhu", + "Xiangyu Zhang", + "Jian Sun" + ], + "emails": [ + "~Xiangyu_Zhang1", + "~Jian_Sun4", + "~Zhanxing_Zhu1", + "~Ruosi_Wan4" + ], + "rank": 1360 + }, + { + "url": "https://openreview.net/forum?id=O1GEH9X8848", + "ratings": [ + 6, + 4, + 7 + ], + "rating": "5.46", + "confidences": [ + 5, + 5, + 3 + ], + "abstract": "We present a probabilistic model for point cloud generation, which is critical for various 3D vision tasks such as shape completion, upsampling, synthesis and data augmentation. Inspired by the diffusion process in non-equilibrium thermodynamics, we view points in point clouds as particles in a thermodynamic system in contact with a heat bath, which diffuse from the original distribution to a noise distribution. Point cloud generation thus amounts to learning the reverse diffusion process that transforms the noise distribution to the distribution of a desired shape. 
Specifically, we propose to model the reverse diffusion process for point clouds as a Markov chain conditioned on certain shape latent. We derive the variational bound in closed form for training and provide implementations of the model. Experimental results demonstrate that our model achieves the state-of-the-art performance in point cloud generation and auto-encoding.", + "title": "A Point Cloud Generative Model Based on Nonequilibrium Thermodynamics", + "authors": [ + "Shitong Luo", + "Wei Hu" + ], + "emails": [ + "~Shitong_Luo1", + "~Wei_Hu6" + ], + "rank": 1361 + }, + { + "url": "https://openreview.net/forum?id=RcjRb9pEQ-Q", + "decision": "Reject", + "ratings": [ + 4, + 6, + 6, + 7 + ], + "rating": "5.46", + "confidences": [ + 5, + 3, + 2, + 3 + ], + "abstract": "We propose a novel approach for generating unrestricted adversarial examples by manipulating fine-grained aspects of image generation. Unlike existing unrestricted attacks that typically hand-craft geometric transformations, we learn stylistic and stochastic modifications leveraging state-of-the-art generative models. This allows us to manipulate an image in a controlled, fine-grained manner without being bounded by a norm threshold. Our approach can be used for targeted and non-targeted unrestricted attacks on classification, semantic segmentation and object detection models. Our attacks can bypass certified defenses, yet our adversarial images look indistinguishable from natural images as verified by human evaluation. Moreover, we demonstrate that adversarial training with our examples improves performance of the model on clean images without requiring any modifications to the architecture. We perform experiments on LSUN, CelebA-HQ and COCO-Stuff as high resolution datasets to validate efficacy of our proposed approach. 
", + "title": "Fine-grained Synthesis of Unrestricted Adversarial Examples", + "authors": [ + "Omid Poursaeed", + "Tianxing Jiang", + "Yordanos Abraham Goshu", + "Harry Yang", + "Serge Belongie", + "Ser-Nam Lim" + ], + "emails": [ + "cornell.edu", + "~Yordanos_Abraham_Goshu1", + "~Serge_Belongie1", + "~Omid_Poursaeed2", + "fb.com", + "~Ser-Nam_Lim3" + ], + "rank": 1362 + }, + { + "url": "https://openreview.net/forum?id=qG4ZVCCyCB0", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 5 + ], + "rating": "5.45", + "confidences": [ + 3, + 2, + 3, + 3 + ], + "abstract": "Meta-learning aims to perform fast adaptation on a new task through learning a \"prior\" from multiple existing tasks. A common practice in meta-learning is to perform a train-validation split where the prior adapts to the task on one split of the data, and the resulting predictor is evaluated on another split. Despite its prevalence, the importance of the train-validation split is not well understood either in theory or in practice, particularly in comparison to the more direct non-splitting method, which uses all the per-task data for both training and evaluation.\nWe provide a detailed theoretical study on the whether and when the train-validation split is helpful on the linear centroid meta-learning problem, in the asymptotic setting where the number of tasks goes to infinity. We show that the splitting method converges to the optimal prior as expected, whereas the non-splitting method does not in general without structural assumptions on the data. In contrast, if the data are generated from linear models (the realizable regime), we show that both the splitting and non-splitting methods converge to the optimal prior. Further, perhaps surprisingly, our main result shows that the non-splitting method achieves a strictly better asymptotic excess risk under this data distribution, even when the regularization parameter and split ratio are optimally tuned for both methods. 
Our results highlight that data splitting may not always be preferable, especially when the data is realizable by the model. We validate our theories by experimentally showing that the non-splitting method can indeed outperform the splitting method, on both simulations and real meta-learning tasks.", + "title": "How Important is the Train-Validation Split in Meta-Learning?", + "authors": [ + "Yu Bai", + "Minshuo Chen", + "Pan Zhou", + "Tuo Zhao", + "Jason D. Lee", + "Sham M. Kakade", + "Huan Wang", + "Caiming Xiong" + ], + "emails": [ + "~Minshuo_Chen1", + "~Yu_Bai1", + "~Pan_Zhou3", + "~Sham_M._Kakade1", + "~Jason_D._Lee1", + "~Huan_Wang1", + "~Tuo_Zhao1", + "~Caiming_Xiong1" + ], + "rank": 1363 + }, + { + "url": "https://openreview.net/forum?id=edku48LG0pT", + "decision": "Reject", + "ratings": [ + 3, + 6, + 6, + 6 + ], + "rating": "5.45", + "confidences": [ + 2, + 3, + 3, + 3 + ], + "abstract": "Markov Chain Monte Carlo (MCMC) methods sample from unnormalized probability distributions and offer guarantees of exact sampling. However, in the continuous case, unfavorable geometry of the target distribution can greatly limit the efficiency of MCMC methods. Augmenting samplers with neural networks can potentially improve their efficiency. Previous neural network based samplers were trained with objectives that either did not explicitly encourage exploration, or used a L2 jump objective which could only be applied to well structured distributions. Thus it seems promising to instead maximize the proposal entropy for adapting the proposal to distributions of any shape. To allow direct optimization of the proposal entropy, we propose a neural network MCMC sampler that has a flexible and tractable proposal distribution. Specifically, our network architecture utilizes the gradient of the target distribution for generating proposals. Our model achieves significantly higher efficiency than previous neural network MCMC techniques in a variety of sampling tasks. 
Further, the sampler is applied on training of a convergent energy-based model of natural images. The learned sampler achieves significantly higher proposal entropy and sample quality compared to Langevin dynamics sampler.", + "title": "A Neural Network MCMC sampler that maximizes Proposal Entropy", + "authors": [ + "ZENGYI LI", + "Yubei Chen", + "Friedrich Sommer" + ], + "emails": [ + "~Friedrich_Sommer1", + "~ZENGYI_LI1", + "~Yubei_Chen1" + ], + "rank": 1364 + }, + { + "url": "https://openreview.net/forum?id=iVaPuvROtMm", + "decision": "Reject", + "ratings": [ + 5, + 8, + 4 + ], + "rating": "5.45", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "Learning nonlinear dynamics from aggregate data is a challenging problem since the full trajectory of each individual is not available, namely, the individual observed at one time point may not be observed at next time point, or the identity of individual is unavailable. This is in sharp contrast to learning dynamics with trajectory data, on which the majority of existing methods are based. We propose a novel method using the weak form of Fokker Planck Equation (FPE) to describe density evolution of data in a sampling form, which is then combined with Wasserstein generative adversarial network (WGAN) in training process. In such a sample-based framework we are able to study nonlinear dynamics from aggregate data without solving the partial differential equation (PDE). The model can also handle high dimensional cases with the help of deep neural networks. 
We demonstrate our approach in the context of a series of synthetic and real-world data sets.", + "title": "Learning Stochastic Behaviour from Aggregate Data", + "authors": [ + "Shaojun Ma", + "Shu Liu", + "Hongyuan Zha", + "Hao-Min Zhou" + ], + "emails": [ + "gatech.edu", + "~Hongyuan_Zha1", + "~Hao-Min_Zhou1", + "~Shaojun_Ma1" + ], + "rank": 1365 + }, + { + "url": "https://openreview.net/forum?id=e8W-hsu_q5", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 4, + 7, + 5 + ], + "rating": "5.45", + "confidences": [ + 3, + 2, + 2, + 4 + ], + "abstract": "We present the group equivariant conditional neural process (EquivCNP), a meta-learning method with permutation invariance in a data set as in conventional conditional neural processes (CNPs), and it also has transformation equivariance in data space. Incorporating group equivariance, such as rotation and scaling equivariance, provides a way to consider the symmetry of real-world data. We give a decomposition theorem for permutation-invariant and group-equivariant maps, which leads us to construct EquivCNPs with an infinite-dimensional latent space to handle group symmetries. In this paper, we build architecture using Lie group convolutional layers for practical implementation. We show that EquivCNP with translation equivariance achieves comparable performance to conventional CNPs in a 1D regression task. 
Moreover, we demonstrate that incorporating an appropriate Lie group equivariance, EquivCNP is capable of zero-shot generalization for an image-completion task by selecting an appropriate Lie group equivariance.", + "title": "Group Equivariant Conditional Neural Processes", + "authors": [ + "Makoto Kawano", + "Wataru Kumagai", + "Akiyoshi Sannai", + "Yusuke Iwasawa", + "Yutaka Matsuo" + ], + "emails": [ + "~Wataru_Kumagai2", + "~Yusuke_Iwasawa1", + "~Makoto_Kawano1", + "~Yutaka_Matsuo1", + "~Akiyoshi_Sannai1" + ], + "rank": 1366 + }, + { + "url": "https://openreview.net/forum?id=wXoHN-Zoel", + "decision": "Reject", + "ratings": [ + 6, + 6, + 4 + ], + "rating": "5.45", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "One of the main barriers to the broader adoption of algorithmic fairness in machine learning is the trade-off between fairness and performance of ML models: many practitioners are unwilling to sacrifice the performance of their ML model for fairness. In this paper, we show that this trade-off may not be necessary. If the algorithmic biases in an ML model are due to sampling biases in the training data, then enforcing algorithmic fairness may improve the performance of the ML model on unbiased test data. We study conditions under which enforcing algorithmic fairness helps practitioners learn the Bayes decision rule for (unbiased) test data from biased training data. 
We also demonstrate the practical implications of our theoretical results in real-world ML tasks.", + "title": "There is no trade-off: enforcing fairness can improve accuracy", + "authors": [ + "Subha Maity", + "Debarghya Mukherjee", + "Mikhail Yurochkin", + "Yuekai Sun" + ], + "emails": [ + "~Subha_Maity1", + "umich.edu", + "~Mikhail_Yurochkin1", + "~Yuekai_Sun1" + ], + "rank": 1367 + }, + { + "url": "https://openreview.net/forum?id=8wa7HrUsElL", + "decision": "Reject", + "ratings": [ + 7, + 6, + 6, + 3 + ], + "rating": "5.45", + "confidences": [ + 3, + 3, + 2, + 3 + ], + "abstract": "Even in simple multi-agent systems, fixed incentives can lead to outcomes that are poor for the group and each individual agent. We propose a method, D3C, for online adjustment of agent incentives that reduces the loss incurred at a Nash equilibrium. Agents adjust their incentives by learning to mix their incentive with that of other agents, until a compromise is reached in a distributed fashion. We show that D3C improves outcomes for each agent and the group as a whole in several social dilemmas including a traffic network with Braess\u2019s paradox, a prisoner\u2019s dilemma, and several reinforcement learning domains.", + "title": "D3C: Reducing the Price of Anarchy in Multi-Agent Learning", + "authors": [ + "Ian Gemp", + "Kevin McKee", + "Richard Everett", + "Edgar Alfredo Duenez-Guzman", + "Yoram Bachrach", + "David Balduzzi", + "Andrea Tacchetti" + ], + "emails": [ + "~David_Balduzzi1", + "~Edgar_Alfredo_Duenez-Guzman1", + "~Andrea_Tacchetti1", + "~Ian_Gemp1", + "google.com", + "~Richard_Everett1", + "~Yoram_Bachrach2" + ], + "rank": 1368 + }, + { + "url": "https://openreview.net/forum?id=SQfqNwVoWu", + "decision": "Reject", + "ratings": [ + 6, + 5, + 7, + 4 + ], + "rating": "5.44", + "confidences": [ + 3, + 5, + 4, + 4 + ], + "abstract": "We study the problem of probabilistic inference on the joint distribution defined by a normalizing flow model. 
Given a pre-trained flow model p(x), we wish to estimate p(x2\u2223x1) for some arbitrary partitioning of the variables x=(x1,x2). We first show that this task is computationally hard for a large class of flow models. Motivated by this hardness result, we propose a framework for approximate probabilistic inference. Specifically, our method trains a new generative model with the property that its composition with the given model approximates the target conditional distribution. By parametrizing this new distribution as another flow model, we can efficiently train it using variational inference and also handle conditioning under arbitrary differentiable transformations. Since the resulting approximate posterior remains a flow, it offers exact likelihood evaluation, inversion, and efficient sampling. We provide an extensive empirical evidence showcasing the flexibility of our method on a variety of inference tasks with applications to inverse problems. We also experimentally demonstrate that our approach is comparable to simple MCMC baselines in terms of sample quality. Further, we explain the failure of naively applying variational inference and show that our method does not suffer from the same issue.", + "title": "Approximate Probabilistic Inference with Composed Flows", + "authors": [ + "Jay Whang", + "Erik Lindgren", + "Alex Dimakis" + ], + "emails": [ + "~Jay_Whang1", + "~Alex_Dimakis1", + "~Erik_Lindgren1" + ], + "rank": 1369 + }, + { + "url": "https://openreview.net/forum?id=agyFqcmgl6y", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 5 + ], + "rating": "5.44", + "confidences": [ + 5, + 4, + 3, + 4 + ], + "abstract": "This paper proposes a Disentangled gEnerative cAusal Representation (DEAR) learning method. Unlike existing disentanglement methods that enforce independence of the latent variables, we consider the general case where the underlying factors of interests can be causally correlated. 
We show that previous methods with independent priors fail to disentangle causally related factors. Motivated by this finding, we propose a new disentangled learning method called DEAR that enables causal controllable generation and causal representation learning. The key ingredient of this new formulation is to use a structural causal model (SCM) as the prior for a bidirectional generative model. A generator is then trained jointly with an encoder using a suitable GAN loss. Theoretical justification on the proposed formulation is provided, which guarantees disentangled causal representation learning under appropriate conditions. We conduct extensive experiments on both synthesized and real datasets to demonstrate the effectiveness of DEAR in causal controllable generation, and the benefits of the learned representations for downstream tasks in terms of sample efficiency and distributional robustness.", + "title": "Disentangled Generative Causal Representation Learning", + "authors": [ + "Xinwei Shen", + "Furui Liu", + "Hanze Dong", + "Qing LIAN", + "Zhitang Chen", + "Tong Zhang" + ], + "emails": [ + "~Hanze_Dong1", + "~Qing_LIAN3", + "~Zhitang_Chen1", + "~Tong_Zhang2", + "~Furui_Liu1", + "~Xinwei_Shen1" + ], + "rank": 1370 + }, + { + "url": "https://openreview.net/forum?id=66H4g_OHdnl", + "decision": "Reject", + "ratings": [ + 6, + 6, + 3, + 8 + ], + "rating": "5.44", + "confidences": [ + 4, + 4, + 5, + 3 + ], + "abstract": "We study regularized deep neural networks (DNNs) and introduce a convex analytic framework to characterize the structure of the hidden layers. We show that a set of optimal hidden layer weights for a norm regularized DNN training problem can be explicitly found as the extreme points of a convex set. For the special case of deep linear networks with K outputs, we prove that each optimal weight matrix is rank-K and aligns with the previous layers via duality. 
More importantly, we apply the same characterization to deep ReLU networks with whitened data and prove the same weight alignment holds. As a corollary, we prove that norm regularized deep ReLU networks yield spline interpolation for one-dimensional datasets which was previously known only for two-layer networks. Furthermore, we provide closed-form solutions for the optimal layer weights when data is rank-one or whitened. We then verify our theory via numerical experiments.", + "title": "Revealing the Structure of Deep Neural Networks via Convex Duality", + "authors": [ + "Tolga Ergen", + "Mert Pilanci" + ], + "emails": [ + "~Mert_Pilanci3", + "~Tolga_Ergen1" + ], + "rank": 1371 + }, + { + "url": "https://openreview.net/forum?id=1yDrpckYHnN", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 7 + ], + "rating": "5.44", + "confidences": [ + 4, + 5, + 5, + 4 + ], + "abstract": "Self-supervised pre-training of text representations has been successfully applied to low-resource Neural Machine Translation (NMT). However, it usually fails to achieve notable gains on resource-rich NMT. In this paper, we propose a joint training approach, F2-XEnDec, to combine self-supervised and supervised learning to optimize NMT models. To exploit complementary self-supervised signals for supervised learning, NMT models are trained on examples that are interbred from monolingual and parallel sentences through a new process called crossover encoder-decoder. Experiments on two resource-rich translation bench-marks, WMT\u201914 English-German and WMT\u201914 English-French, demonstrate that our approach achieves substantial improvements over a vanilla Transformer and obtains a new state of the art of 46 BLEU on English-French. 
Results also show that our approach is capable of improving model robustness against input perturbations which is known as a key weakness in contemporary NMT systems.", + "title": "Self-supervised and Supervised Joint Training for Resource-rich Machine Translation", + "authors": [ + "Yong Cheng", + "Wei Wang.", + "Lu Jiang", + "Wolfgang Macherey" + ], + "emails": [ + "~Wei_Wang.1", + "~Yong_Cheng3", + "~Lu_Jiang1", + "~Wolfgang_Macherey1" + ], + "rank": 1372 + }, + { + "url": "https://openreview.net/forum?id=_bF8aOMNIdu", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 6 + ], + "rating": "5.44", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "Successful training of deep neural networks with noisy labels is an essential capability as most real-world datasets contain some amount of mislabeled data. Left unmitigated, label noise can sharply degrade typical supervised learning approaches. In this paper, we present robust temporal ensembling (RTE), a simple supervised learning approach which combines robust task loss, temporal pseudo-labeling, and a new ensemble consistency regularization term to achieve noise-robust learning. We demonstrate that RTE achieves state-of-the-art performance across the CIFAR-10, CIFAR-100, and ImageNet datasets, while forgoing the recent trend of label filtering/fixing. In particular, RTE achieves 93.64% accuracy on CIFAR-10 and 66.43% accuracy on CIFAR-100 under 80% label corruption, and achieves 74.79% accuracy on ImageNet under 40% corruption. These are substantial gains over previous state-of-the-art accuracies of 86.6%, 60.2%, and 71.31%, respectively, achieved using three distinct methods. 
Finally, we show that RTE retains competitive corruption robustness to unforeseen input noise using CIFAR-10-C, obtaining a mean corruption error (mCE) of 13.50% even in the presence of an 80% noise ratio, versus 26.9% mCE with standard methods on clean data.", + "title": "Robust Temporal Ensembling", + "authors": [ + "Abel Brown", + "Benedikt Schifferer", + "Robert DiPietro" + ], + "emails": [ + "~Benedikt_Schifferer2", + "~Robert_DiPietro1", + "~Abel_Brown1" + ], + "rank": 1373 + }, + { + "url": "https://openreview.net/forum?id=D62nJAdpijt", + "decision": "Reject", + "ratings": [ + 5, + 7, + 4, + 6 + ], + "rating": "5.44", + "confidences": [ + 4, + 4, + 5, + 5 + ], + "abstract": "In this work, we naturally unify adversarial examples and Trojan backdoors into a new stealthy attack, that is activated only when 1) adversarial perturbation is injected into the input examples and 2) a Trojan backdoor is used to poison the training process simultaneously. Different from traditional attacks, we leverage adversarial noise in the input space to move Trojan-infected examples across the model decision boundary, thus making it difficult to be detected. Our attack can fool the user into accidentally trusting the infected model as a robust classifier against adversarial examples. 
We perform a thorough analysis and conduct an extensive set of experiments on several benchmark datasets to show that our attack can bypass existing defenses with a success rate close to 100%.", + "title": "Trojans and Adversarial Examples: A Lethal Combination", + "authors": [ + "Guanxiong Liu", + "Issa Khalil", + "Abdallah Khreishah", + "Hai Phan" + ], + "emails": [ + "hbku.edu.qa", + "~Guanxiong_Liu1", + "~Abdallah_Khreishah1", + "~Hai_Phan1" + ], + "rank": 1374 + }, + { + "url": "https://openreview.net/forum?id=aNqEm3NyqOg", + "ratings": [ + 4, + 7, + 5 + ], + "rating": "5.44", + "confidences": [ + 2, + 3, + 4 + ], + "abstract": "Current deep learning paradigms largely benefit from the tremendous amount of annotated data. However, the quality of the annotations often varies among labelers. Multi-observer studies have been conducted to study these annotation variances (by labeling the same data for multiple times) and its effects on critical applications like medical image analysis. This process indeed adds an extra burden to the already tedious annotation work that usually requires professional training and expertise in the specific domains. On the other hand, automated annotation methods based on NLP algorithms have recently shown promise as a reasonable alternative, relying on the existing diagnostic reports of those images that are widely available in the clinical system. Compared to human labelers, different algorithms provide labels with varying qualities that are even noisier. In this paper, we show how noisy annotations (e.g., from different algorithm-based labelers) can be utilized together and mutually benefit the learning of classification tasks. Specifically, the concept of attention-on-label is introduced to sample better label sets on-the-fly as the training data. A meta-training based label-sampling module is designed to attend the labels that benefit the model learning the most through additional back-propagation processes. 
We apply the attention-on-label scheme on the classification task of a synthetic noisy CIFAR-10 dataset to prove the concept, and then demonstrate superior results (3-5% increase on average in multiple disease classification AUCs) on the chest x-ray images from a hospital-scale dataset (MIMIC-CXR) and hand-labeled dataset (OpenI) in comparison to regular training paradigms.", + "title": "Learning Image Labels On-the-fly for Training Robust Classification Models", + "authors": [ + "Xiaosong Wang", + "Ziyue Xu", + "Dong Yang", + "Leo K Tam", + "Holger R Roth", + "Daguang Xu" + ], + "emails": [ + "~Dong_Yang1", + "~Daguang_Xu2", + "~Ziyue_Xu1", + "~Leo_K_Tam1", + "~Holger_R_Roth1", + "~Xiaosong_Wang1" + ], + "rank": 1375 + }, + { + "url": "https://openreview.net/forum?id=uVnhiRaW3J", + "decision": "Reject", + "ratings": [ + 5, + 4, + 6, + 7, + 5 + ], + "rating": "5.44", + "confidences": [ + 3, + 4, + 3, + 4, + 2 + ], + "abstract": "Reinforcement Learning (RL) with safety guarantee is critical for agents performing tasks in risky environments. Recent safe RL algorithms, developed based on Constrained Markov Decision Process (CMDP), mostly take the safety requirement as additional constraints when learning to maximize the return. However, they usually make unnecessary compromises in return for safety and only learn sub-optimal policies, due to the inability of differentiating safe and unsafe state-actions with high rewards. To address this, we propose Cost-sensitive Advantage Estimation (CSAE), which is simple to deploy for policy optimization and effective for guiding the agents to avoid unsafe state-actions by penalizing their advantage value properly. Moreover, for stronger safety guarantees, we develop a Worst-case Constrained Markov Decision Process (WCMDP) method to augment CMDP by constraining the worst-case safety cost instead of the average one. 
With CSAE and WCMDP, we develop new safe RL algorithms with theoretical justifications on their benefits for safety and performance of the obtained policies. Extensive experiments clearly demonstrate the superiority of our algorithms in learning safer and better agents under multiple settings. ", + "title": "Learning Safe Policies with Cost-sensitive Advantage Estimation", + "authors": [ + "Bingyi Kang", + "Shie Mannor", + "Jiashi Feng" + ], + "emails": [ + "~Bingyi_Kang1", + "~Jiashi_Feng1", + "~Shie_Mannor2" + ], + "rank": 1376 + }, + { + "url": "https://openreview.net/forum?id=xfNotLXwtQb", + "decision": "Reject", + "ratings": [ + 6, + 4, + 6, + 6 + ], + "rating": "5.44", + "confidences": [ + 4, + 5, + 5, + 4 + ], + "abstract": "Collaborative filtering has shown great power in predicting potential user-item ratings by factorizing an observed user-item rating matrix into products of two sets of latent factors. However, the user-specific latent factors can only be learned in transductive setting and a model trained on existing users cannot adapt to new users without retraining the model. In this paper, we propose an inductive collaborative filtering framework that learns a hidden relational graph among users from the rating matrix. We first consider a base matrix factorization model trained on one group of users' ratings and devise a relation inference model that estimates their underlying relations (as dense weighted graphs) to other users with respect to historical rating patterns. The relational graphs enable attentive message passing from users to users in the latent space and are updated in end-to-end manner. The key advantage of our model is the capability for inductively computing user-specific representations using no feature, with good scalability and superior expressiveness compared to other feature-driven inductive models. 
Extensive experiments demonstrate that our model achieves state-of-the-art performance for inductive learning on several matrix completion benchmarks, provides very close performance to transductive models when given many training ratings and exceeds them significantly on cold-start users.", + "title": "Inductive Collaborative Filtering via Relation Graph Learning", + "authors": [ + "Qitian Wu", + "Hengrui Zhang", + "Xiaofeng Gao", + "Hongyuan Zha" + ], + "emails": [ + "cs.sjtu.edu.cn", + "~Hongyuan_Zha1", + "sjtu.edu.cn", + "~Qitian_Wu1" + ], + "rank": 1377 + }, + { + "url": "https://openreview.net/forum?id=B8fp0LVMHa", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 4 + ], + "rating": "5.43", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "Off-policy reinforcement learning (RL) holds the promise of sample-efficient learning of decision-making policies by leveraging past experience. However, in the offline RL setting -- where a fixed collection of interactions are provided and no further interactions are allowed -- it has been shown that standard off-policy RL methods can significantly underperform. Recently proposed methods often aim to address this shortcoming by constraining learned policies to remain close to the given dataset of interactions. In this work, we closely investigate an important simplification of BCQ~\\citep{fujimoto2018off} -- a prior approach for offline RL -- which removes a heuristic design choice and naturally restrict extracted policies to remain \\emph{exactly} within the support of a given behavior policy. Importantly, in contrast to their original theoretical considerations, we derive this simplified algorithm through the introduction of a novel backup operator, Expected-Max Q-Learning (EMaQ), which is more closely related to the resulting practical algorithm. 
Specifically, in addition to the distribution support, EMaQ explicitly considers the number of samples and the proposal distribution, allowing us to derive new sub-optimality bounds which can serve as a novel measure of complexity for offline RL problems. In the offline RL setting -- the main focus of this work -- EMaQ matches and outperforms prior state-of-the-art in the D4RL benchmarks~\\citep{fu2020d4rl}. In the online RL setting, we demonstrate that EMaQ is competitive with Soft Actor Critic (SAC). The key contributions of our empirical findings are demonstrating the importance of careful generative model design for estimating behavior policies, and an intuitive notion of complexity for offline RL problems. With its simple interpretation and fewer moving parts, such as no explicit function approximator representing the policy, EMaQ serves as a strong yet easy to implement baseline for future work.", + "title": "EMaQ: Expected-Max Q-Learning Operator for Simple Yet Effective Offline and Online RL", + "authors": [ + "Seyed Kamyar Seyed Ghasemipour", + "Dale Schuurmans", + "Shixiang Gu" + ], + "emails": [ + "~Dale_Schuurmans1", + "~Seyed_Kamyar_Seyed_Ghasemipour1", + "~Shixiang_Gu1" + ], + "rank": 1378 + }, + { + "url": "https://openreview.net/forum?id=UmrVpylRExB", + "decision": "Reject", + "ratings": [ + 6, + 8, + 4, + 4 + ], + "rating": "5.43", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "In this paper, we target an important issue of deep convolutional neural networks (CNNs) \u2014 the lack of a mathematical understanding of their properties. We present an explicit formalism that is motivated by the similarities between trained CNN kernels and oriented Gabor filters for addressing this problem. The core idea is to constrain the behavior of convolutional layers by splitting them into a succession of wavelet packet decompositions, which are modulated by freely-trained mixture weights. 
We evaluate our approach with three variants of wavelet decompositions with the AlexNet architecture for image classification as an example. The first variant relies on the separable wavelet packet transform while the other two implement the 2D dual-tree real and complex wavelet packet transforms, taking advantage of their feature extraction properties such as directional selectivity and shift invariance. Our experiments show that we achieve the accuracy rate of standard AlexNet, but with a significantly lower number of parameters, and an interpretation of the network that is grounded in mathematical theory.", + "title": "Dual-Tree Wavelet Packet CNNs for Image Classification", + "authors": [ + "Hubert Leterme", + "K\u00e9vin Polisano", + "Val\u00e9rie Perrier", + "Karteek Alahari" + ], + "emails": [ + "grenoble-inp.fr", + "univ-grenoble-alpes.fr", + "~Hubert_Leterme1", + "~Karteek_Alahari1" + ], + "rank": 1379 + }, + { + "url": "https://openreview.net/forum?id=WGWzwdjm8mS", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 7 + ], + "rating": "5.43", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "Validation-based early-stopping methods are one of the most popular techniques used to avoid over-training deep neural networks. They require to set aside a reliable unbiased validation set, which can be expensive in applications offering limited amounts of data. In this paper, we propose to use \\emph{gradient disparity}, which we define as the \u21132 norm distance between the gradient vectors of two batches drawn from the training set. It comes from a probabilistic upper bound on the difference between the classification errors over a given batch, when the network is trained on this batch and when the network is trained on another batch of points sampled from the same dataset. We empirically show that gradient disparity is a very promising early-stopping criterion when data is limited, because it uses all the training samples during training. 
Furthermore, we show in a wide range of experimental settings that gradient disparity is not only strongly related to the usual generalization error between the training and test sets, but that it is also much more informative about the level of label noise. ", + "title": "Early Stopping by Gradient Disparity", + "authors": [ + "mahsa forouzesh", + "Patrick Thiran" + ], + "emails": [ + "~Patrick_Thiran1", + "~mahsa_forouzesh1" + ], + "rank": 1380 + }, + { + "url": "https://openreview.net/forum?id=1wtC_X12XXC", + "decision": "Reject", + "ratings": [ + 4, + 8, + 7, + 4 + ], + "rating": "5.43", + "confidences": [ + 4, + 2, + 4, + 4 + ], + "abstract": "The backpropagation of error algorithm (backprop) has been instrumental in the recent success of deep learning. However, a key question remains as to whether backprop can be formulated in a manner suitable for implementation in neural circuitry. The primary challenge is to ensure that any candidate formulation uses only local information, rather than relying on global signals as in standard backprop. Recently several algorithms for approximating backprop using only local signals have been proposed. However, these algorithms typically impose other requirements which challenge biological plausibility: for example, requiring complex and precise connectivity schemes, or multiple sequential backwards phases with information being stored across phases. Here, we propose a novel algorithm, Activation Relaxation (AR), which is motivated by constructing the backpropagation gradient as the equilibrium point of a dynamical system. Our algorithm converges rapidly and robustly to the correct backpropagation gradients, requires only a single type of computational unit, utilises only a single parallel backwards relaxation phase, and can operate on arbitrary computation graphs. 
We illustrate these properties by training deep neural networks on visual classification tasks, and describe simplifications to the algorithm which remove further obstacles to neurobiological implementation (for example, the weight-transport problem, and the use of nonlinear derivatives), while preserving performance.", + "title": "Activation Relaxation: A Local Dynamical Approximation to Backpropagation in the Brain", + "authors": [ + "Beren Millidge", + "Alexander Tschantz", + "Anil K Seth", + "Christopher Buckley" + ], + "emails": [ + "~Anil_K_Seth1", + "~Christopher_Buckley1", + "~Alexander_Tschantz1", + "~Beren_Millidge1" + ], + "rank": 1381 + }, + { + "url": "https://openreview.net/forum?id=1eKz1kjHO1", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 6 + ], + "rating": "5.43", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Visual context is versatile and hard to describe or label precisely. We aim to leverage the densely labeled task, image parsing, a.k.a panoptic segmentation, to learn a model that encodes and discovers object-centric context. Most existing approaches based on deep learning tackle image parsing via fusion of pixel-wise classification and instance masks from two sub-networks. Such approaches isolate things from stuff and fuse the semantic and instance masks in the later stage. To encode object-centric context inherently, we propose a metric learning framework, Panoptic Segment Sorting, that is directly trained with stuff and things jointly. Our key insight is to make the panoptic embeddings separate every instance so that the model automatically learns to leverage visual context as many instances across different images appear similar. We show that the context of our model's retrieved instances is more consistent relatively by 13.7%, further demonstrating its ability to discover novel context unsupervisedly. 
Our overall framework also achieves competitive performance across standard panoptic segmentation metrics amongst the state-of-the-art methods on two large datasets, Cityscapes and PASCAL VOC. These promising results suggest that pixel-wise embeddings can not only inject new understanding into panoptic segmentation but potentially serve for other tasks such as modeling instance relationships.", + "title": "Contextual Image Parsing via Panoptic Segment Sorting", + "authors": [ + "Jyh-Jing Hwang", + "Tsung-Wei Ke", + "Stella Yu" + ], + "emails": [ + "~Tsung-Wei_Ke2", + "~Jyh-Jing_Hwang1", + "~Stella_Yu2" + ], + "rank": 1382 + }, + { + "url": "https://openreview.net/forum?id=xboZWqM_ELA", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4, + 8 + ], + "rating": "5.43", + "confidences": [ + 4, + 4, + 2, + 4 + ], + "abstract": "Most existing Graph Neural Networks (GNNs) are proposed without considering the selection bias in data, i.e., the inconsistent distribution between the training set with test set. In reality, the test data is not even available during the training process, making selection bias agnostic. Training GNNs with biased selected nodes leads to significant parameter estimation bias and greatly impacts the generalization ability on test nodes. In this paper, we first present an experimental investigation, which clearly shows that the selection bias drastically hinders the generalization ability of GNNs, and theoretically prove that the selection bias will cause the biased estimation on GNN parameters. Then to remove the bias in GNN estimation, we propose a novel Debiased Graph Neural Networks (DGNN) with a differentiated decorrelation regularizer. The differentiated decorrelation regularizer estimates a sample weight for each labeled node such that the spurious correlation of learned embeddings could be eliminated. 
We analyze the regularizer in causal view and it motivates us to differentiate the weights of the variables based on their contribution on the confounding bias. Then, these sample weights are used for reweighting GNNs to eliminate the estimation bias, thus help to improve the stability of prediction on unknown test nodes. Comprehensive experiments are conducted on several challenging graph datasets with two kinds of label selection bias. The results well verify that our proposed model outperforms the state-of-the-art methods and DGNN is a flexible framework to enhance existing GNNs.", + "title": "Debiased Graph Neural Networks with Agnostic Label Selection Bias", + "authors": [ + "Shaohua Fan", + "Xiao Wang", + "Chuan Shi", + "Kun Kuang", + "Nian Liu", + "Bai Wang" + ], + "emails": [ + "~Kun_Kuang1", + "~Shaohua_Fan1", + "bupt.edu.cn" + ], + "rank": 1383 + }, + { + "url": "https://openreview.net/forum?id=CU0APx9LMaL", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 7, + 6, + 6, + 4 + ], + "rating": "5.43", + "confidences": [ + 5, + 3, + 4, + 4, + 5 + ], + "abstract": "Powered by innovations in novel architecture design, noise tolerance techniques and increasing model capacity, Automatic Speech Recognition (ASR) has made giant strides in reducing word-error-rate over the past decade. ASR models are often trained with tens of thousand hours of high quality speech data to produce state-of-the-art (SOTA) results. Industry-scale ASR model training thus remains computationally heavy and time-consuming, and consequently has attracted little attention in adopting automatic techniques. On the other hand, Neural Architecture Search (NAS) has gained a lot of interest in the recent years thanks to its successes in discovering efficient architectures, often outperforming handcrafted alternatives. 
However, by changing the standard training process into a bi-level optimisation problem, NAS approaches often require significantly more time and computational power compared to single-model training, and at the same time increase complexity of the overall process. As a result, NAS has been predominately applied to problems which do not require as extensive training as ASR, and even then reproducibility of NAS algorithms is often problematic. Lately, a number of benchmark datasets has been introduced to address reproducibility issues by pro- viding NAS researchers with information about performance of different models obtained through exhaustive evaluation. However, these datasets focus mainly on computer vision and NLP tasks and thus suffer from limited coverage of application domains. In order to increase diversity in the existing NAS benchmarks, and at the same time provide systematic study of the effects of architectural choices for ASR, we release NAS-Bench-ASR \u2013 the first NAS benchmark for ASR models. The dataset consists of 8, 242 unique models trained on the TIMIT audio dataset for three different target epochs, and each starting from three different initializations. The dataset also includes runtime measurements of all the models on a diverse set of hardware platforms. Lastly, we show that identified good cell structures in our search space for TIMIT transfer well to a much larger LibriSpeech dataset.", + "title": "NAS-Bench-ASR: Reproducible Neural Architecture Search for Speech Recognition", + "authors": [ + "Abhinav Mehrotra", + "Alberto Gil C. P. 
Ramos", + "Sourav Bhattacharya", + "\u0141ukasz Dudziak", + "Ravichander Vipperla", + "Thomas Chau", + "Mohamed S Abdelfattah", + "Samin Ishtiaq", + "Nicholas Donald Lane" + ], + "emails": [ + "~\u0141ukasz_Dudziak1", + "~Sourav_Bhattacharya1", + "~Mohamed_S_Abdelfattah1", + "~Ravichander_Vipperla1", + "~Nicholas_Donald_Lane1", + "~Abhinav_Mehrotra1", + "samsung.com" + ], + "rank": 1384 + }, + { + "url": "https://openreview.net/forum?id=ghKbryXRRAB", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 4 + ], + "rating": "5.43", + "confidences": [ + 4, + 4, + 4, + 2 + ], + "abstract": "The state of the art of language models, previously dominated by pre-trained word embeddings, is now being pushed forward by large pre-trained contextual representations. This success has driven growing interest to understand what these models encode inside their inner workings. Despite this, understanding their semantic skills has been elusive, often leading to unsuccessful, non-conclusive, or contradictory results among different works. In this work, we define a probing classifier that we use to extract the underlying knowledge graph of nine of the currently most influential language models, including word embeddings, context encoders, and text generators. This probe is based on concept relatedness, grounded on WordNet. Our results show that this knowledge is present in all the models, but has several inaccuracies. Furthermore, we show that the different pre-training strategies and architectures lead to different model biases. We conduct a systematic evaluation to discover specific factors that explain why some concepts are challenging for the different families of models. 
We hope our insights will motivate the future development of models that capture concepts more precisely.", + "title": "Tracking the progress of Language Models by extracting their underlying Knowledge Graphs", + "authors": [ + "Carlos Aspillaga", + "Marcelo Mendoza", + "Alvaro Soto" + ], + "emails": [ + "~Alvaro_Soto1", + "~Marcelo_Mendoza1", + "~Carlos_Aspillaga1" + ], + "rank": 1385 + }, + { + "url": "https://openreview.net/forum?id=33TBJachvOX", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 6 + ], + "rating": "5.43", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Adversarial robustness of machine learning models has attracted considerable attention over recent years. Adversarial attacks undermine the reliability of and trust in machine learning models, but the construction of more robust models hinges on a rigorous understanding of adversarial robustness as a property of a given model. Point-wise measures for specific threat models are currently the most popular tool for comparing the robustness of classifiers and are used in most recent publications on adversarial robustness. In this work, we use robustness curves to show that point-wise measures fail to capture important global properties that are essential to reliably compare the robustness of different classifiers. We introduce new ways in which robustness curves can be used to systematically uncover these properties and provide concrete recommendations for researchers and practitioners when assessing and comparing the robustness of trained models. Furthermore, we characterize scale as a way to distinguish small and large perturbations, and relate it to inherent properties of data sets, demonstrating that robustness thresholds must be chosen accordingly. We hope that our work contributes to a shift of focus away from point-wise measures of robustness and towards a discussion of the question what kind of robustness could and should reasonably be expected. 
We release code to reproduce all experiments presented in this paper, which includes a Python module to calculate robustness curves for arbitrary data sets and classifiers, supporting a number of frameworks, including TensorFlow, PyTorch and JAX.", + "title": "How to compare adversarial robustness of classifiers from a global perspective", + "authors": [ + "Niklas Risse", + "Jan Philip G\u00f6pfert", + "Christina G\u00f6pfert" + ], + "emails": [ + "techfak.uni-bielefeld.de", + "~Christina_G\u00f6pfert1", + "~Niklas_Risse1" + ], + "rank": 1386 + }, + { + "url": "https://openreview.net/forum?id=gdtGg1hCK2", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 6 + ], + "rating": "5.43", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "Despite several important advances in recent years, learning causal structures represented by directed acyclic graphs (DAGs) remains a challenging task in high dimensional settings when the graphs to be learned are not sparse. In this paper, we propose to exploit a low rank assumption regarding the (weighted) adjacency matrix of a DAG causal model to mitigate this problem. We demonstrate how to adapt existing methods for causal structure learning to take advantage of this assumption and establish several useful results relating interpretable graphical conditions to the low rank assumption. In particular, we show that the maximum rank is highly related to hubs, suggesting that scale-free networks which are frequently encountered in real applications tend to be low rank. We also provide empirical evidence for the utility of our low rank adaptations, especially on relatively large and dense graphs. 
Not only do they outperform existing algorithms when the low rank condition is satisfied, the performance is also competitive even though the rank of the underlying DAG may not be as low as is assumed.", + "title": "On Low Rank Directed Acyclic Graphs and Causal Structure Learning", + "authors": [ + "Zhuangyan Fang", + "Shengyu Zhu", + "Jiji Zhang", + "Yue Liu", + "Zhitang Chen", + "Yangbo He" + ], + "emails": [ + "~Zhuangyan_Fang1", + "~Jiji_Zhang1", + "pku.edu.cn", + "~Shengyu_Zhu1", + "huawei.com", + "~Zhitang_Chen1" + ], + "rank": 1387 + }, + { + "url": "https://openreview.net/forum?id=sAX7Z7uIJ_Y", + "decision": "Reject", + "ratings": [ + 4, + 6, + 6, + 6 + ], + "rating": "5.43", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Ambiguities in images or unsystematic annotation can lead to multiple valid solutions in semantic segmentation. To learn a distribution over predictions, recent work has explored the use of probabilistic networks. However, these do not necessarily capture the empirical distribution accurately. In this work, we aim to learn a calibrated multimodal predictive distribution, where the empirical frequency of the sampled predictions closely reflects that of the corresponding labels in the training set. To this end, we propose a novel two-stage, cascaded strategy for calibrated adversarial refinement. In the first stage, we explicitly model the data with a categorical likelihood. In the second, we train an adversarial network to sample from it an arbitrary number of coherent predictions. The model can be used independently or integrated into any black-box segmentation framework to facilitate learning of calibrated stochastic mappings. We demonstrate the utility and versatility of the approach by attaining state-of-the-art results on the multigrader LIDC dataset and a modified Cityscapes dataset. 
In addition, we use a toy regression dataset to show that our framework is not confined to semantic segmentation, and the core design can be adapted to other tasks requiring learning a calibrated predictive distribution.", + "title": "Calibrated Adversarial Refinement for Stochastic Semantic Segmentation", + "authors": [ + "Elias Kassapis", + "Georgi Dikov", + "Deepak Gupta", + "Cedric Nugteren" + ], + "emails": [ + "~Deepak_Gupta2", + "~Georgi_Dikov1", + "~Elias_Kassapis1", + "~Cedric_Nugteren1" + ], + "rank": 1388 + }, + { + "url": "https://openreview.net/forum?id=xF5r3dVeaEl", + "decision": "Reject", + "ratings": [ + 6, + 3, + 7, + 6 + ], + "rating": "5.43", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Modelling the behaviours of other agents (opponents) is essential for understanding how agents interact and making effective decisions. Existing methods for opponent modelling commonly assume knowledge of the local observations and chosen actions of the modelled opponents, which can significantly limit their applicability. We propose a new modelling technique based on variational autoencoders, which are trained to reconstruct the local actions and observations of the opponent based on embeddings which depend only on the local observations of the modelling agent (its observed world state, chosen actions, and received rewards). The embeddings are used to augment the modelling agent's decision policy which is trained via deep reinforcement learning; thus the policy does not require access to opponent observations. 
We provide a comprehensive evaluation and ablation study in diverse multi-agent tasks, showing that our method achieves comparable performance to an ideal baseline which has full access to opponent's information, and significantly higher returns than a baseline method which does not use the learned embeddings.", + "title": "Local Information Opponent Modelling Using Variational Autoencoders", + "authors": [ + "Georgios Papoudakis", + "Filippos Christianos", + "Stefano V Albrecht" + ], + "emails": [ + "~Georgios_Papoudakis1", + "ed.ac.uk", + "~Stefano_V_Albrecht1" + ], + "rank": 1389 + }, + { + "url": "https://openreview.net/forum?id=0xdQXkz69x9", + "decision": "Reject", + "ratings": [ + 6, + 6, + 4, + 6 + ], + "rating": "5.43", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "Few-shot learning systems, especially those based on meta-learning, have recently made significant advances, and are now being considered for real world problems in healthcare, personalization, and science. In this paper, we examine the robustness of such deployed few-shot learning systems when they are fed an imperceptibly perturbed few-shot dataset, showing that the resulting predictions on test inputs can become worse than chance. This is achieved by developing a novel Adversarial Support Set Attack which crafts a poisoned set of examples. When even a small subset of malicious data points is inserted into the support set of a meta-learner, accuracy is significantly reduced. For example, the average classification accuracy of CNAPs on the Aircraft dataset in the META-DATASET benchmark drops from 69.2% to 9.1% when only 20% of the support set is poisoned by imperceptible perturbations. We evaluate the new attack on a variety of few-shot classification algorithms including MAML, prototypical networks, and CNAPs, on both small scale (miniImageNet) and large scale (META-DATASET) few-shot classification problems. 
Interestingly, adversarial support sets produced by attacking a meta-learning based few-shot classifier can also reduce the accuracy of a fine-tuning based few-shot classifier when both models use similar feature extractors.", + "title": "Attacking Few-Shot Classifiers with Adversarial Support Sets", + "authors": [ + "Elre Talea Oldewage", + "John F Bronskill", + "Richard E Turner" + ], + "emails": [ + "~Elre_Talea_Oldewage1", + "~Richard_E_Turner1", + "~John_F_Bronskill1" + ], + "rank": 1390 + }, + { + "url": "https://openreview.net/forum?id=JYVODnDjU20", + "decision": "Reject", + "ratings": [ + 5, + 5, + 7, + 5 + ], + "rating": "5.43", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "In this paper we present SemSAD, a simple and generic framework for detecting examples that lie out-of-distribution (OOD) for a given training set. The approach is based on learning a semantic similarity measure to find for a given test example the semantically closest example in the training set and then using a discriminator to classify whether the two examples show sufficient semantic dissimilarity such that the test example can be rejected as OOD. We are able to outperform previous approaches for anomaly, novelty, or out-of-distribution detection in the visual domain by a large margin. In particular we obtain AUROC values close to one for the challenging task of detecting examples from CIFAR-10 as out-of-distribution given CIFAR-100 as in-distribution, without making use of label information. 
", + "title": "UNSUPERVISED ANOMALY DETECTION FROM SEMANTIC SIMILARITY SCORES", + "authors": [ + "Nima Rafiee", + "Rahil Gholamipoor", + "Markus Kollmann" + ], + "emails": [ + "~Markus_Kollmann1", + "hhu.de" + ], + "rank": 1391 + }, + { + "url": "https://openreview.net/forum?id=dqyK5RKMaW4", + "decision": "Reject", + "ratings": [ + 7, + 5, + 4, + 6 + ], + "rating": "5.43", + "confidences": [ + 4, + 2, + 5, + 3 + ], + "abstract": "The use of deep learning has grown at an exponential rate, giving rise to numerous specialized hardware and software systems for deep learning. Because the design space of deep learning software stacks and hardware accelerators is diverse and vast, prior work considers software optimizations separately from hardware architectures, effectively reducing the search space. Unfortunately, this bifurcated approach means that many profitable design points are never explored. This paper instead casts the problem as hardware/software co-design, with the goal of automatically identifying desirable points in the joint design space. The key to our solution is a new constrained Bayesian optimization framework that avoids invalid solutions by exploiting the highly constrained features of this design space, which are semi-continuous/semi-discrete. 
We evaluate our optimization framework by applying it to a variety of neural models, improving the energy-delay product by 18% (ResNet) and 40% (DQN) over hand-tuned state-of-the-art systems, as well as demonstrating strong results on other neural network architectures, such as MLPs and Transformers.", + "title": "LEARNED HARDWARE/SOFTWARE CO-DESIGN OF NEURAL ACCELERATORS", + "authors": [ + "Zhan Shi", + "Chirag Sakhuja", + "Milad Hashemi", + "Kevin Swersky", + "Calvin Lin" + ], + "emails": [ + "~Calvin_Lin1", + "utexas.edu", + "~Milad_Hashemi1", + "~Zhan_Shi3", + "~Kevin_Swersky1" + ], + "rank": 1392 + }, + { + "url": "https://openreview.net/forum?id=bJLHjvYV1Cu", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 5 + ], + "rating": "5.43", + "confidences": [ + 4, + 2, + 4, + 4 + ], + "abstract": "Metalearning of deep neural network (DNN) architectures and hyperparameters has become an increasingly important area of research. Loss functions are a type of metaknowledge that is crucial to effective training of DNNs, however, their potential role in metalearning has not yet been fully explored. Whereas early work focused on genetic programming (GP) on tree representations, this paper proposes continuous CMA-ES optimization of multivariate Taylor polynomial parameterizations. This approach, TaylorGLO, makes it possible to represent and search useful loss functions more effectively. In MNIST, CIFAR-10, and SVHN benchmark tasks, TaylorGLO finds new loss functions that outperform functions previously discovered through GP, as well as the standard cross-entropy loss, in fewer generations. These functions serve to regularize the learning task by discouraging overfitting to the labels, which is particularly useful in tasks where limited training data is available. 
The results thus demonstrate that loss function optimization is a productive new avenue for metalearning.", + "title": "Optimizing Loss Functions Through Multivariate Taylor Polynomial Parameterization", + "authors": [ + "Santiago Gonzalez", + "Risto Miikkulainen" + ], + "emails": [ + "~Santiago_Gonzalez1", + "~Risto_Miikkulainen1" + ], + "rank": 1393 + }, + { + "url": "https://openreview.net/forum?id=eYgI3cTPTq9", + "decision": "Reject", + "ratings": [ + 5, + 7, + 4, + 6 + ], + "rating": "5.43", + "confidences": [ + 4, + 4, + 4, + 2 + ], + "abstract": "Text-based games are long puzzles or quests, characterized by a sequence of sparse and potentially deceptive rewards. They provide an ideal platform to develop agents that perceive and act upon the world using a combinatorially sized natural language state-action space. Standard Reinforcement Learning agents are poorly equipped to effectively explore such spaces and often struggle to overcome bottlenecks---states that agents are unable to pass through simply because they do not see the right action sequence enough times to be sufficiently reinforced. We introduce Q*BERT, an agent that learns to build a knowledge graph of the world by answering questions, which leads to greater sample efficiency. To overcome bottlenecks, we further introduce MC!Q*BERT an agent that uses an knowledge-graph-based intrinsic motivation to detect bottlenecks and a novel exploration strategy to efficiently learn a chain of policy modules to overcome them. 
We present an ablation study and results demonstrating how our method outperforms the current state-of-the-art on nine text games, including the popular game, Zork, where, for the first time, a learning agent gets past the bottleneck where the player is eaten by a Grue.", + "title": "How to Avoid Being Eaten by a Grue: Structured Exploration Strategies for Textual Worlds", + "authors": [ + "Prithviraj Ammanabrolu", + "Ethan Tien", + "Matthew Hausknecht", + "Mark Riedl" + ], + "emails": [ + "~Matthew_Hausknecht1", + "~Ethan_Tien1", + "~Prithviraj_Ammanabrolu1", + "~Mark_Riedl1" + ], + "rank": 1394 + }, + { + "url": "https://openreview.net/forum?id=jQSBcVURlpW", + "decision": "Reject", + "ratings": [ + 5, + 5, + 7, + 6 + ], + "rating": "5.42", + "confidences": [ + 4, + 4, + 1, + 3 + ], + "abstract": "Is intelligence realized by connectionist or classicist? While connectionist approaches have achieved superhuman performance, there has been growing evidence that such task-specific superiority is particularly fragile in systematic generalization. This observation lies in the central debate (Fodor et al., 1988; Fodor &McLaughlin, 1990) between connectionist and classicist, wherein the latter continually advocates an algebraic treatment in cognitive architectures. In this work, we follow the classicist's call and propose a hybrid approach to improve systematic generalization in reasoning. Specifically, we showcase a prototype with algebraic representations for the abstract spatial-temporal reasoning task of Raven\u2019s Progressive Matrices (RPM) and present the ALgebra-Aware Neuro-Semi-Symbolic (ALANS2) learner. The ALANS2 learner is motivated by abstract algebra and the representation theory. It consists of a neural visual perception frontend and an algebraic abstract reasoning backend: the frontend summarizes the visual information from object-based representations, while the backend transforms it into an algebraic structure and induces the hidden operator on-the-fly. 
The induced operator is later executed to predict the answer's representation, and the choice most similar to the prediction is selected as the solution. Extensive experiments show that by incorporating an algebraic treatment, the ALANS2 learner outperforms various pure connectionist models in domains requiring systematic generalization. We further show that the algebraic representation learned can be decoded by isomorphism and used to generate an answer.", + "title": "Learning Algebraic Representation for Abstract Spatial-Temporal Reasoning", + "authors": [ + "Chi Zhang", + "Sirui Xie", + "Baoxiong Jia", + "Yixin Zhu", + "Ying Nian Wu", + "Song-Chun Zhu" + ], + "emails": [ + "~Song-Chun_Zhu1", + "~Yixin_Zhu1", + "~Sirui_Xie1", + "~Baoxiong_Jia1", + "~Ying_Nian_Wu1", + "~Chi_Zhang12" + ], + "rank": 1395 + }, + { + "url": "https://openreview.net/forum?id=I6QHpMdZD5k", + "decision": "Reject", + "ratings": [ + 7, + 5, + 6, + 5, + 4 + ], + "rating": "5.42", + "confidences": [ + 4, + 5, + 4, + 2, + 4 + ], + "abstract": "Semiconductor device simulation uses numerical analysis, where a set of coupled nonlinear partial differential equations is solved with the iterative Newton-Raphson method. Since an appropriate initial guess to start the Newton-Raphson method is not available, a solution of practical importance with desired boundary conditions cannot be trivially achieved. Instead, several solutions with intermediate boundary conditions should be calculated to address the nonlinearity and introducing intermediate boundary conditions significantly increases the computation time. In order to accelerate the semiconductor device simulation, we propose to use a neural network to learn an approximate solution for desired boundary conditions. With an initial solution sufficiently close to the final one by a trained neural network, computational cost to calculate several unnecessary solutions is significantly reduced. 
Specifically, a convolutional neural network for MOSFET (Metal-Oxide-Semiconductor Field-Effect Transistor), the most widely used semiconductor device, are trained in a supervised manner to compute the initial solution. Particularly, we propose to consider device grids with varying size and spacing and derive a compact expression of the solution based upon the electrostatic potential. We empirically show that the proposed method accelerates the simulation by more than 12 times. Results from the local linear regression and a fully-connected network are compared and extension to a complex two-dimensional domain is sketched.", + "title": "Learning to Solve Nonlinear Partial Differential Equation Systems To Accelerate MOSFET Simulation", + "authors": [ + "Seungcheol Han", + "Jonghyun Choi", + "Sung-Min Hong" + ], + "emails": [ + "~Sung-Min_Hong1", + "~Seungcheol_Han1", + "~Jonghyun_Choi1" + ], + "rank": 1396 + }, + { + "url": "https://openreview.net/forum?id=SXoheAR0Gz", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5 + ], + "rating": "5.42", + "confidences": [ + 5, + 3, + 4 + ], + "abstract": "Given a time-series vector, how can we efficiently compute a specified part of Fourier coefficients? Fast Fourier transform (FFT) is a widely used algorithm that computes the discrete Fourier transform in many machine learning applications. Despite the pervasive use, FFT algorithms do not provide a fine-tuning option for the user to specify one\u2019s demand, that is, the output size (the number of Fourier coefficients to be computed) is algorithmically determined by the input size. Such a lack of flexibility is often followed by just discarding the unused coefficients because many applications do not require the whole spectrum of the frequency domain, resulting in an inefficiency due to the extra computation. \nIn this paper, we propose a fast Partial Fourier Transform (PFT), an efficient algorithm for computing only a part of Fourier coefficients. 
PFT approximates a part of twiddle factors (trigonometric constants) using polynomials, thereby reducing the computational complexity due to the mixture of many twiddle factors. We derive the asymptotic time complexity of PFT with respect to input and output sizes, as well as its numerical accuracy. Experimental results show that PFT outperforms the current state-of-the-art algorithms, with an order of magnitude of speedup for sufficiently small output sizes without sacrificing accuracy.", + "title": "Fast Partial Fourier Transform", + "authors": [ + "Yong-chan Park", + "Jun-Gi Jang", + "U Kang" + ], + "emails": [ + "~Jun-Gi_Jang1", + "~U_Kang1", + "~Yong-chan_Park1" + ], + "rank": 1397 + }, + { + "url": "https://openreview.net/forum?id=0IOX0YcCdTn", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 4 + ], + "rating": "5.42", + "confidences": [ + 3, + 4, + 5 + ], + "abstract": "Given a simple request like Put a washed apple in the kitchen fridge, humans can reason in purely abstract terms by imagining action sequences and scoring their likelihood of success, prototypicality, and efficiency, all without moving a muscle. Once we see the kitchen in question, we can update our abstract plans to fit the scene. Embodied agents require the same abilities, but existing work does not yet provide the infrastructure necessary for both reasoning abstractly and executing concretely. We address this limitation by introducing ALFWorld, a simulator that enables agents to learn abstract, text-based policies in TextWorld (C\u00f4t\u00e9 et al., 2018) and then execute goals from the ALFRED benchmark (Shridhar et al., 2020) in a rich visual environment. ALFWorld enables the creation of a new BUTLER agent whose abstract knowledge, learned in TextWorld, corresponds directly to concrete, visually grounded actions. In turn, as we demonstrate empirically, this fosters better agent generalization than training only in the visually grounded environment. 
BUTLER\u2019s simple, modular design factors the problem to allow researchers to focus on models for improving every piece of the pipeline (language understanding, planning, navigation, and visual scene understanding).", + "title": "ALFWorld: Aligning Text and Embodied Environments for Interactive Learning", + "authors": [ + "Mohit Shridhar", + "Xingdi Yuan", + "Marc-Alexandre Cote", + "Yonatan Bisk", + "Adam Trischler", + "Matthew Hausknecht" + ], + "emails": [ + "~Adam_Trischler1", + "~Xingdi_Yuan2", + "~Marc-Alexandre_Cote1", + "~Mohit_Shridhar1", + "~Matthew_Hausknecht1", + "~Yonatan_Bisk1" + ], + "rank": 1398 + }, + { + "url": "https://openreview.net/forum?id=X7iHv744p5Y", + "ratings": [ + 5, + 6, + 5 + ], + "rating": "5.42", + "confidences": [ + 4, + 5, + 3 + ], + "abstract": "Pretrained Language Models (LMs) generate text with remarkable quality, novelty, and coherence while semantically conditioning on context. Yet applying LMs to the seemingly simpler problems of paraphrasing and infilling currently requires supervision, since these tasks break the left-to-right generation setup of pretrained LMs. We present Reflective Decoding, a novel unsupervised approach to apply the capabilities of pretrained LMs to non-sequential tasks. Our approach is general and applicable to two distant tasks -- paraphrasing and abductive reasoning. It requires no supervision or parallel corpora, only two pretrained language models: forward and backward. Reflective Decoding operates in two intuitive steps. In the contextualization step, we use LMs to generate many left and right contexts which collectively capture the meaning of the input sentence. Then, in the reflection step we decode in the semantic neighborhood of the input, conditioning on an ensemble of generated contexts with the reverse direction LM. We reflect through the generated contexts, effectively using them as an intermediate meaning representation to generate conditional output. 
Empirical results demonstrate that Reflective Decoding outperforms strong unsupervised baselines on both paraphrasing and abductive text infilling, significantly narrowing the gap between unsupervised and supervised methods. Reflective Decoding introduces the concept of using generated contexts to represent meaning, opening up new possibilities for unsupervised conditional text generation. ", + "title": "Reflective Decoding: Unsupervised Paraphrasing and Abductive Reasoning", + "authors": [ + "Peter West", + "Ximing Lu", + "Ari Holtzman", + "Chandra Bhagavatula", + "Jena Hwang", + "Yejin Choi" + ], + "emails": [ + "~Chandra_Bhagavatula1", + "allenai.org", + "~Ari_Holtzman1", + "~Peter_West1", + "~Yejin_Choi1" + ], + "rank": 1399 + }, + { + "url": "https://openreview.net/forum?id=4YzI0KpRQtZ", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 6 + ], + "rating": "5.42", + "confidences": [ + 4, + 2, + 3, + 3 + ], + "abstract": "Despite the success of existing tensor factorization methods, most of them conduct a multilinear decomposition, and rarely exploit powerful modeling frameworks, like deep neural networks, to capture a variety of complicated interactions in data. More important, for highly expressive, deep factorization, we lack an effective approach to handle streaming data, which are ubiquitous in real-world applications. To address these issues, we propose SPIDER, a Streaming ProbabilistIc Deep tEnsoR factorization method. We first use Bayesian neural networks (NNs) to construct a deep tensor factorization model. We assign a spike-and-slab prior over the NN weights to encourage sparsity and prevent overfitting. We then use Taylor expansions and moment matching to approximate the posterior of the NN output and calculate the running model evidence, based on which we develop an efficient streaming posterior inference algorithm in the assumed-density-filtering and expectation propagation framework. 
Our algorithm provides responsive incremental updates for the posterior of the latent factors and NN weights upon receiving new tensor entries, and meanwhile select and inhibit redundant/useless weights. We show the advantages of our approach in four real-world applications.", + "title": "Streaming Probabilistic Deep Tensor Factorization", + "authors": [ + "shikai fang", + "Zheng Wang", + "Zhimeng pan", + "Ji Liu", + "Shandian Zhe" + ], + "emails": [ + "utah.edu", + "~Shandian_Zhe1", + "~Zheng_Wang2", + "~shikai_fang1", + "~Ji_Liu1" + ], + "rank": 1400 + }, + { + "url": "https://openreview.net/forum?id=zLWGnikc_wi", + "ratings": [ + 5, + 5, + 6 + ], + "rating": "5.42", + "confidences": [ + 4, + 3, + 5 + ], + "abstract": "Adversarial examples have shown that albeit highly accurate, models learned by machines, differently from humans, have many weaknesses. However, humans\u2019 perception is also fundamentally different from machines, because we do not see the signals which arrive at the retina but a rather complex recreation of them. In this paper, we explore how machines could recreate the input as well as investigate the benefits of such an augmented perception. In this regard, we propose Perceptual Deep Neural Networks (\u03d5DNN) which also recreate their own input before further processing. The concept is formalized mathematically and two variations of it are developed (one based on inpainting the whole image and the other based on a noisy resized super resolution recreation). Experiments reveal that \u03d5DNNs and their adversarial training variations can increase the robustness substantially, surpassing both state-of-the-art defenses and pre-processing types of defenses in 100% of the tests. \u03d5DNNs are shown to scale well to bigger image sizes, keeping a similar high accuracy throughout; while the state-of-the-art worsen up to 35%. Moreover, the recreation process intentionally corrupts the input image. 
Interestingly, we show by ablation tests that corrupting the input is, although counterintuitive, beneficial. Thus, \u03d5DNNs reveal that input recreation has strong benefits for artificial neural networks similar to biological ones, shedding light into the importance of purposely corrupting the input as well as pioneering an area of perception models based on GANs and autoencoders for robust recognition in artificial intelligence.", + "title": "Perceptual Deep Neural Networks: Adversarial Robustness Through Input Recreation", + "authors": [ + "Danilo Vasconcellos Vargas", + "Bingli Liao", + "Takahiro Kanzaki" + ], + "emails": [ + "~Danilo_Vasconcellos_Vargas1", + "s.kyushu-u.ac.jp" + ], + "rank": 1401 + }, + { + "url": "https://openreview.net/forum?id=G67PtYbCImX", + "decision": "Reject", + "ratings": [ + 5, + 4, + 8 + ], + "rating": "5.42", + "confidences": [ + 5, + 4, + 3 + ], + "abstract": "Many active learning and search approaches are intractable for industrial settings with billions of unlabeled examples. Existing approaches, such as uncertainty sampling or information density, search globally for the optimal examples to label, scaling linearly or even quadratically with the unlabeled data. However, in practice, data is often heavily skewed; only a small fraction of collected data will be relevant for a given learning task. For example, when identifying rare classes, detecting malicious content, or debugging model performance, positive examples can appear in less than 1% of the data. In this work, we exploit this skew in large training datasets to reduce the number of unlabeled examples considered in each selection round by only looking at the nearest neighbors to the labeled examples. Empirically, we observe that learned representations can effectively cluster unseen concepts, making active learning very effective and substantially reducing the number of viable unlabeled examples. 
We evaluate several selection strategies in this setting on three large-scale computer vision datasets: ImageNet, OpenImages, and a proprietary dataset of 10 billion images from a large internet company. For rare classes, active learning methods need as little as 0.31% of the labeled data to match the average precision of full supervision. By limiting the selection strategies to the immediate neighbors of the labeled data as candidates for labeling, we process as little as 0.1% of the unlabeled data while achieving similar reductions in labeling costs as the traditional global approach. This process of expanding the candidate pool with the nearest neighbors of the labeled set can be done efficiently and reduces the computational complexity of selection by orders of magnitude. ", + "title": "Similarity Search for Efficient Active Learning and Search of Rare Concepts", + "authors": [ + "Cody Coleman", + "Edward Chou", + "Sean Culatana", + "Peter Bailis", + "Alexander C. Berg", + "Roshan Sumbaly", + "Matei Zaharia", + "I. Zeki Yalniz" + ], + "emails": [ + "~Matei_Zaharia1", + "~Alexander_C._Berg1", + "gmail.com", + "~I._Zeki_Yalniz1", + "~Peter_Bailis1", + "fb.com", + "~Cody_Coleman1" + ], + "rank": 1402 + }, + { + "url": "https://openreview.net/forum?id=aD1_5zowqV", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 4, + 5, + 7 + ], + "rating": "5.41", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "Energy-based models (EBMs) parametrized by neural networks can be trained by Markov chain Monte Carlo (MCMC) sampling-based maximum likelihood estimation. Despite the recent significant success of EBMs in data generation, the current approaches to train EBMs can be unstable and sometimes may have difficulty synthesizing diverse and high-fidelity images. In this paper, we propose to train EBMs via a multistage coarse-to-fine expanding and sampling strategy, namely CF-EBM. 
To improve the learning procedure, we propose an effective net architecture and advocate applying smooth activations. The resulting approach is computationally efficient and achieves the best performance on image generation amongst EBMs and the spectral normalization GAN. Furthermore, we provide a recipe for being the first successful EBM to synthesize 512\u00d7512-pixel images and also improve out-of-distribution detection. In the end, we effortlessly generalize CF-EBM to the one-sided unsupervised image-to-image translation and beat baseline methods with the model size and the training budget largely reduced. In parallel, we present a gradient-based discriminative saliency method to interpret the translation dynamics which align with human behavior explicitly.", + "title": "Learning Energy-Based Generative Models via Coarse-to-Fine Expanding and Sampling", + "authors": [ + "Yang Zhao", + "Jianwen Xie", + "Ping Li" + ], + "emails": [ + "~Yang_Zhao5", + "~Ping_Li3", + "~Jianwen_Xie1" + ], + "rank": 1403 + }, + { + "url": "https://openreview.net/forum?id=6KZ_kUVCfTa", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 5 + ], + "rating": "5.41", + "confidences": [ + 5, + 3, + 4, + 5 + ], + "abstract": "High-dimensional observations are a major challenge in the application of model-based reinforcement learning (MBRL) to real-world environments. In order to handle high-dimensional sensory inputs, existing MBRL approaches use representation learning to map high-dimensional observations into a lower-dimensional latent space that is more amenable to dynamics estimation and planning. Crucially, the task-relevance and predictability of the learned representations play critical roles in the success of planning in latent space. 
In this work, we present Non-Markovian Predictive Coding (NMPC), an information-theoretic approach for planning from high-dimensional observations with two key properties: 1) it formulates a mutual information objective that prioritizes the encoding of task-relevant components of the environment; and 2) it employs a recurrent neural network capable of modeling non-Markovian latent dynamics. To demonstrate NMPC\u2019s ability to prioritize task-relevant information, we evaluate our new model on a challenging modification of standard DMControl tasks where the DMControl background is replaced with natural videos, containing complex but irrelevant information to the planning task. Our experiments show that NMPC is superior to existing methods in the challenging complex-background setting while remaining competitive with current state-of-the-art MBRL models in the standard setting.", + "title": "Non-Markovian Predictive Coding For Planning In Latent Space", + "authors": [ + "Tung Nguyen", + "Rui Shu", + "Tuan Pham", + "Hung Bui", + "Stefano Ermon" + ], + "emails": [ + "~Tung_Nguyen2", + "vinai.io", + "~Rui_Shu1", + "~Stefano_Ermon1", + "~Hung_Bui1" + ], + "rank": 1404 + }, + { + "url": "https://openreview.net/forum?id=i3Ui1Csrqpm", + "decision": "Reject", + "ratings": [ + 6, + 5, + 7, + 4 + ], + "rating": "5.41", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "Transformer models have garnered a lot of interest in recent years by delivering state-of-the-art performance in a range of Natural Language Processing (NLP) tasks. However, these models can have over a hundred billion parameters, presenting very high computational and memory requirements. We address this challenge through Approximate Computing, specifically targeting the use of Transformers in NLP tasks. Transformers are typically pre-trained and subsequently specialized for specific tasks through transfer learning. 
We observe that pre-trained Transformers are often over-parameterized for several downstream NLP tasks and propose a framework to create smaller and faster models with comparable accuracy. The key cornerstones of the framework are a Significance Analysis (SA) method to identify important components in a pre-trained Transformer for a given task, and techniques to approximate the less significant components. Our framework can be adapted to produce models that are faster, smaller and/or more accurate, depending on the user's constraints. We apply our framework to multiple Transformer models and different downstream tasks, including previously proposed optimized models like DistilBERT and Q8BERT. We demonstrate that our framework produces models that are up to 4\u00d7 faster and up to 14\u00d7 smaller (with less than 0.5% relative accuracy degradation), or up to 5.5% more accurate with simultaneous model size and speed improvements of up to 9.8\u00d7 and 2.9\u00d7, respectively.", + "title": "Optimizing Transformers with Approximate Computing for Faster, Smaller and more Accurate NLP Models", + "authors": [ + "Amrit Nagarajan", + "Sanchari Sen", + "Jacob R. Stevens", + "Anand Raghunathan" + ], + "emails": [ + "~Sanchari_Sen1", + "~Jacob_R._Stevens1", + "~Amrit_Nagarajan1", + "~Anand_Raghunathan1" + ], + "rank": 1405 + }, + { + "url": "https://openreview.net/forum?id=--rcOeCKRh", + "decision": "Reject", + "ratings": [ + 6, + 4, + 6, + 6 + ], + "rating": "5.41", + "confidences": [ + 5, + 5, + 3, + 4 + ], + "abstract": "After learning a new object category from image-level annotations (with no object bounding boxes), humans are remarkably good at precisely localizing those objects. However, building good object localizers (i.e., detectors) currently requires expensive instance-level annotations. While some work has been done on learning detectors from weakly labeled samples (with only class labels), these detectors do poorly at localization. 
In this work, we show how to build better object detectors from weakly labeled images of new categories by leveraging knowledge learned from fully labeled base categories. We call this learning paradigm cross-supervised object detection. While earlier works investigated this paradigm, they did not apply it to realistic complex images (e.g., COCO), and their performance was poor. We propose a unified framework that combines a detection head trained from instance-level annotations and a recognition head learned from image-level annotations, together with a spatial correlation module that bridges the gap between detection and recognition. These contributions enable us to better detect novel objects with image-level annotations in complex multi-object scenes such as the COCO dataset.", + "title": "CROSS-SUPERVISED OBJECT DETECTION", + "authors": [ + "Zitian Chen", + "Zhiqiang Shen", + "Jiahui Yu", + "Erik Learned-Miller" + ], + "emails": [ + "~Zhiqiang_Shen1", + "~Jiahui_Yu1", + "~Erik_Learned-Miller2", + "~Zitian_Chen1" + ], + "rank": 1406 + }, + { + "url": "https://openreview.net/forum?id=IeuEO1TccZn", + "decision": "Reject", + "ratings": [ + 4, + 7, + 6, + 5 + ], + "rating": "5.41", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "We propose a novel approach to representation learning called sufficient and disentangled representation learning (SDRL). With SDRL, we seek a data representation that maps the input data to a lower-dimensional space with two properties: sufficiency and disentanglement. First, the representation is sufficient in the sense that the original input data is conditionally independent of the response or label given the representation. Second, the representation is maximally disentangled with mutually independent components and rotation invariant in distribution. We show that such a representation always exists under mild conditions on the input data distribution based on optimal transport theory. 
We formulate an objective function characterizing conditional independence and disentanglement. This objective function is then used to train a sufficient and disentangled representation with deep neural networks. We provide strong statistical guarantees for the learned representation by establishing an upper bound on the excess error of the objective function and show that it reaches the nonparametric minimax rate under mild conditions. We also validate the proposed method via numerical experiments and real data analysis.", + "title": "Sufficient and Disentangled Representation Learning", + "authors": [ + "Jian Huang", + "Yuling Jiao", + "Xu Liao", + "Jin Liu", + "Zhou Yu" + ], + "emails": [ + "stat.ecnu.edu.cn", + "duke-nus.edu.sg", + "u.duke.nus.edu", + "~Jian_Huang5", + "whu.edu.cn" + ], + "rank": 1407 + }, + { + "url": "https://openreview.net/forum?id=muppfCkU9H1", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 6 + ], + "rating": "5.41", + "confidences": [ + 5, + 5, + 3, + 4 + ], + "abstract": "Self-attention mechanism in graph neural networks (GNNs) led to state-of-the-art performance on many graph representation learning task. Currently, at every layer, attention is computed between connected pairs of nodes and depends solely on the representation of the two nodes. However, such attention mechanism does not account for nodes that are not directly connected but provide important network context, which could lead to improved predictive performance. Here we propose Multi-hop Attention Graph Neural Network (MAGNA), a principled way to incorporate multi-hop context information into attention computation, enabling long-range interactions at every layer of the GNN. 
To compute attention between nodes that are not directly connected,\nMAGNA diffuses the attention scores across the network, which increases the ''receptive field'' for every layer of the GNN.\nUnlike previous approaches, MAGNA uses a diffusion prior on attention values, to efficiently account for all paths between the pair of disconnected nodes. This helps MAGNA capture large-scale structural information in every layer, and learn more informative attention. Experimental results on node classification as well as the knowledge graph completion benchmarks show that MAGNA achieves state-of-the-art results: MAGNA achieves up to 5.7% relative error reduction over the previous state-of-the-art on Cora, Citeseer, and Pubmed. MAGNA also obtains the best performance on a large-scale Open Graph Benchmark dataset. On knowledge graph completion MAGNA advances state-of-the-art on WN18RR and FB15k-237 across four different performance metrics.", + "title": "Multi-hop Attention Graph Neural Network", + "authors": [ + "Guangtao Wang", + "Zhitao Ying", + "Jing Huang", + "Jure Leskovec" + ], + "emails": [ + "~Jing_Huang3", + "~Guangtao_Wang1", + "~Zhitao_Ying1", + "~Jure_Leskovec1" + ], + "rank": 1408 + }, + { + "url": "https://openreview.net/forum?id=H38f_9b90BO", + "decision": "Reject", + "ratings": [ + 7, + 4, + 5, + 6 + ], + "rating": "5.41", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "Massive labeled data have been used in training deep neural networks, thus label noise has become an important issue therein. Although learning with noisy labels has made great progress on image datasets in recent years, it has not yet been studied in connection with utilizing GNNs to classify graph nodes. In this paper, we proposed a method, named LPM, to address the problem using Label Propagation (LP) and Meta learning. 
Different from previous methods designed for image datasets, our method is based on a special attribute (label smoothness) of graph-structured data, i.e., neighboring nodes in a graph tend to have the same label. A pseudo label is computed from the neighboring labels for each node in the training set using LP; meta learning is utilized to learn a proper aggregation of the original and pseudo label as the final label. Experimental results demonstrate that LPM outperforms state-of-the-art methods in graph node classification task with both synthetic and real-world label noise. Source code to reproduce all results will be released.", + "title": "Towards Robust Graph Neural Networks against Label Noise", + "authors": [ + "Jun Xia", + "Haitao Lin", + "Yongjie Xu", + "Lirong Wu", + "Zhangyang Gao", + "Siyuan Li", + "Stan Z. Li" + ], + "emails": [ + "~Lirong_Wu1", + "~Stan_Z._Li2", + "~Jun_Xia1", + "~Haitao_Lin2", + "~Zhangyang_Gao1", + "~Siyuan_Li6", + "~Yongjie_Xu1" + ], + "rank": 1409 + }, + { + "url": "https://openreview.net/forum?id=S5S3eTEmouw", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 6 + ], + "rating": "5.40", + "confidences": [ + 5, + 4, + 3, + 3 + ], + "abstract": "This paper introduces the offline meta-reinforcement learning (offline meta-RL) problem setting and proposes an algorithm that performs well in this setting. Offline meta-RL is analogous to the widely successful supervised learning strategy of pre-training a model on a large batch of fixed, pre-collected data (possibly from various tasks) and fine-tuning the model to a new task with relatively little data. That is, in offline meta-RL, we meta-train on fixed, pre-collected data from several tasks and adapt to a new task with a very small amount (less than 5 trajectories) of data from the new task. 
By nature of being offline, algorithms for offline meta-RL can utilize the largest possible pool of training data available and eliminate potentially unsafe or costly data collection during meta-training. This setting inherits the challenges of offline RL, but it differs significantly because offline RL does not generally consider a) transfer to new tasks or b) limited data from the test task, both of which we face in offline meta-RL. Targeting the offline meta-RL setting, we propose Meta-Actor Critic with Advantage Weighting (MACAW). MACAW is an optimization-based meta-learning algorithm that uses simple, supervised regression objectives for both the inner and outer loop of meta-training. On offline variants of common meta-RL benchmarks, we empirically find that this approach enables fully offline meta-reinforcement learning and achieves notable gains over prior methods.", + "title": "Offline Meta-Reinforcement Learning with Advantage Weighting", + "authors": [ + "Eric Mitchell", + "Rafael Rafailov", + "Xue Bin Peng", + "Sergey Levine", + "Chelsea Finn" + ], + "emails": [ + "~Xue_Bin_Peng1", + "stanford.edu", + "~Chelsea_Finn1", + "~Eric_Mitchell1", + "~Sergey_Levine1" + ], + "rank": 1410 + }, + { + "url": "https://openreview.net/forum?id=QpT9Q_NNfQL", + "decision": "Reject", + "ratings": [ + 4, + 7, + 7, + 4 + ], + "rating": "5.40", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Whittle index policy is a powerful tool to obtain asymptotically optimal solutions for the notoriously intractable problem of restless bandits. However, finding the Whittle indices remains a difficult problem for many practical restless bandits with convoluted transition kernels. This paper proposes NeurWIN, a neural Whittle index network that seeks to learn the Whittle indices for any restless bandits by leveraging mathematical properties of the Whittle indices. 
We show that a neural network that produces the Whittle index is also one that produces the optimal control for a set of Markov decision problems. This property motivates using deep reinforcement learning for the training of NeurWIN. We demonstrate the utility of NeurWIN by evaluating its performance for three recently studied restless bandit problems. Our experiment results show that the performance of NeurWIN is either better than, or as good as, state-of-the-art policies for all three problems.", + "title": "NeurWIN: Neural Whittle Index Network for Restless Bandits via Deep RL", + "authors": [ + "Khaled Nakhleh", + "Santosh Ganji", + "Ping-Chun Hsieh", + "I-Hong Hou", + "Srinivas Shakkottai" + ], + "emails": [ + "~I-Hong_Hou1", + "~Santosh_Ganji1", + "~Srinivas_Shakkottai1", + "~Khaled_Nakhleh1", + "~Ping-Chun_Hsieh1" + ], + "rank": 1411 + }, + { + "url": "https://openreview.net/forum?id=QpU7n-6l0n", + "decision": "Reject", + "ratings": [ + 6, + 4, + 6 + ], + "rating": "5.40", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "Data augmentation is one of the most popular techniques for improving the robustness of neural networks. In addition to directly training the model with original samples and augmented samples, a torrent of methods regularizing the distance between embeddings/representations of the original samples and their augmented counterparts have been introduced. In this paper, we explore these various regularization choices, seeking to provide a general understanding of how we should regularize the embeddings. Our analysis suggests how the ideal choices of regularization correspond to various assumptions. With an invariance test, we show that regularization is important if the model is to be used in a broader context than the in-lab setting because non-regularized approaches are limited in learning the concept of invariance, despite equally high accuracy. 
Finally, we also show that the generic approach we identified (squared \u21132 norm regularized augmentation) performs better than several recent methods, which are each specially designed for one task and significantly more complicated than ours, over three different tasks.", + "title": "On the Consistency Loss for Leveraging Augmented Data to Learn Robust and Invariant Representations", + "authors": [ + "Haohan Wang", + "Zeyi Huang", + "Xindi Wu", + "Eric Xing" + ], + "emails": [ + "~Xindi_Wu1", + "~Eric_Xing1", + "~Haohan_Wang1", + "~Zeyi_Huang3" + ], + "rank": 1412 + }, + { + "url": "https://openreview.net/forum?id=KwgQn_Aws3_", + "decision": "Reject", + "ratings": [ + 5, + 6, + 7, + 4 + ], + "rating": "5.40", + "confidences": [ + 3, + 3, + 4, + 5 + ], + "abstract": "We propose a novel interpretable recurrent neural network (RNN) model, called ProtoryNet, in which we introduce a new concept of prototype trajectories. Motivated by the prototype theory in modern linguistics, ProtoryNet makes a prediction by finding the most similar prototype for each sentence in a text sequence and feeding an RNN backbone with the proximity of each of the sentences to the prototypes. The RNN backbone then captures the temporal pattern of the prototypes, to which we refer as prototype trajectories. The prototype trajectories enable intuitive, fine-grained interpretation of how the model reached to the final prediction, resembling the process of how humans analyze paragraphs. Experiments conducted on multiple public data sets reveal that the proposed method not only is more interpretable but also is more accurate than the current state-of-the-art prototype-based method. 
Furthermore, we report a survey result indicating that human users find ProtoryNet more intuitive and easier to understand, compared to the other prototype-based methods.", + "title": "Interpretable Sequence Classification Via Prototype Trajectory", + "authors": [ + "Dat Hong", + "Stephen Baek", + "Tong Wang" + ], + "emails": [ + "uiowa.edu", + "~Stephen_Baek1", + "~Tong_Wang4" + ], + "rank": 1413 + }, + { + "url": "https://openreview.net/forum?id=J3OUycKwz-", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 3 + ], + "rating": "5.40", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "In the human brain, sequences of language input are processed within a distributed and hierarchical architecture, in which higher stages of processing encode contextual information over longer timescales. In contrast, in recurrent neural networks which perform natural language processing, we know little about how the multiple timescales of contextual information are functionally organized. Therefore, we applied tools developed in neuroscience to map the \u201cprocessing timescales\u201d of individual units within a word-level LSTM language model. This timescale-mapping method assigned long timescales to units previously found to track long-range syntactic dependencies. Additionally, the mapping revealed a small subset of the network (less than 15% of units) with long timescales and whose function had not previously been explored. We next probed the functional organization of the network by examining the relationship between the processing timescale of units and their network connectivity. We identified two classes of long-timescale units: \u201ccontroller\u201d units composed a densely interconnected subnetwork and strongly projected to the rest of the network, while \u201cintegrator\u201d units showed the longest timescales in the network, and expressed projection profiles closer to the mean projection profile. 
Ablating integrator and controller units affected model performance at different positions within a sentence, suggesting distinctive functions of these two sets of units. Finally, we tested the generalization of these results to a character-level LSTM model and models with different architectures. In summary, we demonstrated a model-free technique for mapping the timescale organization in recurrent neural networks, and we applied this method to reveal the timescale and functional organization of neural language models", + "title": "Mapping the Timescale Organization of Neural Language Models", + "authors": [ + "Hsiang-Yun Sherry Chien", + "Jinhan Zhang", + "Christopher Honey" + ], + "emails": [ + "~Hsiang-Yun_Sherry_Chien1", + "~Jinhan_Zhang1", + "~Christopher_Honey1" + ], + "rank": 1414 + }, + { + "url": "https://openreview.net/forum?id=dak8uQE6BOG", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6 + ], + "rating": "5.40", + "confidences": [ + 3, + 3, + 4 + ], + "abstract": "Conditional Generative Adversarial Nets (cGANs) have been widely adopted for image generation. cGANs take i) a noise vector and ii) a conditional variable as input. The conditional variable can be discrete (e.g., a class label) or continuous (e.g., an input image) resulting into class-conditional (image) generation and image-to-image translation models, respectively. However, depending on whether the conditional variable is discrete or continuous, various cGANs employ substantially different deep architectures and loss functions for their training. In this paper, we propose a novel framework, called MVP, for conditional data generation. MVP resorts to multivariate polynomials of higher-order and treats in a unified way both discrete and continuous conditional variables. MVP is highly expressive, capturing higher-order auto- and cross-correlations of input variables (noise vector and conditional variable). 
Tailored sharing schemes are designed between the polynomial\u2019s parameter tensors, which result in simple recursive formulas. MVP can synthesize realistic images in both class-conditional and image-to-image translation tasks even in the absence of activation functions between the layers. ", + "title": "MVP: Multivariate polynomials for conditional generation", + "authors": [ + "Grigorios Chrysos", + "Yannis Panagakis" + ], + "emails": [ + "~Yannis_Panagakis1", + "~Grigorios_Chrysos1" + ], + "rank": 1415 + }, + { + "url": "https://openreview.net/forum?id=TJzkxFw-mGm", + "decision": "Reject", + "ratings": [ + 7, + 4, + 4, + 7 + ], + "rating": "5.40", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "We consider model-free reinforcement learning (RL) in non-stationary Markov decision processes (MDPs). Both the reward functions and the state transition distributions are allowed to vary over time, either gradually or abruptly, as long as their cumulative variation magnitude does not exceed certain budgets. We propose an algorithm, named Restarted Q-Learning with Upper Confidence Bounds (RestartQ-UCB), for this setting, which adopts a simple restarting strategy and an extra optimism term. Our algorithm outperforms the state-of-the-art (model-based) solution in terms of dynamic regret. Specifically, RestartQ-UCB with Freedman-type bonus terms achieves a dynamic regret of O~(S13A13\u039413HT23), where S and A are the numbers of states and actions, respectively, \u0394>0 is the variation budget, H is the number of steps per episode, and T is the total number of steps. 
We further show that our algorithm is near-optimal by establishing an information-theoretical lower bound of \u03a9(S13A13\u039413H23T23), which to the best of our knowledge is the first impossibility result in non-stationary RL in general.", + "title": "Near-Optimal Regret Bounds for Model-Free RL in Non-Stationary Episodic MDPs", + "authors": [ + "Weichao Mao", + "Kaiqing Zhang", + "Ruihao Zhu", + "David Simchi-Levi", + "Tamer Basar" + ], + "emails": [ + "~Weichao_Mao1", + "~Ruihao_Zhu2", + "mit.edu", + "~Tamer_Basar1", + "~Kaiqing_Zhang3" + ], + "rank": 1416 + }, + { + "url": "https://openreview.net/forum?id=IVwXaHpiO0", + "decision": "Reject", + "ratings": [ + 3, + 4, + 9, + 4, + 7 + ], + "rating": "5.40", + "confidences": [ + 4, + 5, + 4, + 3, + 4 + ], + "abstract": "Estimating causal treatment effects using observational data is a problem with few solutions when the confounder has a temporal structure, e.g. the history of disease progression might impact both treatment decisions and clinical outcomes. For such a challenging problem, it is desirable for the method to be transparent --- the ability to pinpoint a small subset of data points that contributes most to the estimate and to clearly indicate whether the estimate is reliable or not. This paper develops a new method, SyncTwin, to overcome temporal confounding in a transparent way. SyncTwin estimates the treatment effect of a target individual by comparing the outcome with its synthetic twin, which is constructed to closely match the target in the representation of the temporal confounders. SyncTwin achieves transparency by enforcing the synthetic twin to only depend on the weighted combination of few other individuals in the dataset. Moreover, the quality of the synthetic twin can be assessed by a performance metric, which also indicates the reliability of the estimated treatment effect. 
Experiments demonstrate that SyncTwin outperforms the benchmarks in clinical observational studies while still being transparent.", + "title": "SyncTwin: Transparent Treatment Effect Estimation under Temporal Confounding", + "authors": [ + "Zhaozhi Qian", + "Yao Zhang", + "Ioana Bica", + "Angela Wood", + "Mihaela van der Schaar" + ], + "emails": [ + "~Ioana_Bica1", + "~Mihaela_van_der_Schaar2", + "~Zhaozhi_Qian1", + "~Yao_Zhang3", + "medschl.cam.ac.uk" + ], + "rank": 1417 + }, + { + "url": "https://openreview.net/forum?id=8CCwiOHx_17", + "decision": "Reject", + "ratings": [ + 6, + 5, + 4, + 7 + ], + "rating": "5.40", + "confidences": [ + 1, + 3, + 3, + 3 + ], + "abstract": "Learning to autonomously navigate the web is a difficult sequential decision making task. The state and action spaces are large and combinatorial in nature, and successful navigation may require traversing several partially-observed pages. One of the bottlenecks of training web navigation agents is providing a learnable curriculum of training environments that can cover the large variety of real-world websites. Therefore, we propose using Adversarial Environment Generation (AEG) to generate challenging web environments in which to train reinforcement learning (RL) agents. We introduce a new benchmarking environment, gMiniWoB, which enables an RL adversary to use compositional primitives to learn to generate complex websites. To train the adversary, we present a new decoder-like architecture that can directly control the difficulty of the environment, and a new training technique Flexible b-PAIRED. Flexible b-PAIRED jointly trains the adversary and a population of navigator agents and incentivizes the adversary to generate \u201djust-the-right-challenge\u201d environments by simultaneously learning two policies encoded in the adversary\u2019s architecture. 
First, for its environment complexity choice (difficulty budget), the adversary is rewarded with the performance of the best-performing agent in the population. Second, for selecting the design elements the adversary learns to maximize the regret using the difference in capabilities of navigator agents in population (flexible regret). The results show that the navigator agent trained with Flexible b-PAIRED generalizes to new environments, significantly outperforms competitive automatic curriculum generation baselines\u2014including a state-of-the-art RL web navigation approach and prior methods for minimax regret AEG\u2014on a set of challenging unseen test environments that are order of magnitude more complex than the previous benchmarks. The navigator agent achieves more than 75% success rate on all tasks, yielding 4x higher success rate that the strongest baseline.", + "title": "Adversarial Environment Generation for Learning to Navigate the Web", + "authors": [ + "Izzeddin Gur", + "Natasha Jaques", + "Kevin Malta", + "Manoj Tiwari", + "Honglak Lee", + "Aleksandra Faust" + ], + "emails": [ + "~Izzeddin_Gur1", + "google.com", + "~Aleksandra_Faust1", + "~Honglak_Lee2", + "~Natasha_Jaques1" + ], + "rank": 1418 + }, + { + "url": "https://openreview.net/forum?id=ZVqZIA1GA_", + "decision": "Reject", + "ratings": [ + 4, + 6, + 6 + ], + "rating": "5.40", + "confidences": [ + 3, + 3, + 4 + ], + "abstract": "Capsule networks promise significant benefits over convolutional networks by storing stronger internal representations, and routing information based on the agreement between intermediate representations' projections. Despite this, their success has been mostly limited to small-scale classification datasets due to their computationally expensive nature. Recent studies have partially overcome this burden by locally-constraining the dynamic routing of features with convolutional capsules. 
Though memory efficient, convolutional capsules impose geometric constraints which fundamentally limit the ability of capsules to model the pose/deformation of objects. Further, they do not address the bigger memory concern of class-capsules scaling-up to bigger tasks such as detection or large-scale classification. In this study, we introduce deformable capsules (DeformCaps), a new capsule structure (SplitCaps), and a novel dynamic routing algorithm (SE-Routing) to balance computational efficiency with the need for modeling a large number of objects and classes. We demonstrate that the proposed methods allow capsules to efficiently scale-up to large-scale computer vision tasks for the first time, and create the first-ever capsule network for object detection in the literature. Our proposed architecture is a one-stage detection framework and obtains results on MS COCO which are on-par with state-of-the-art one-stage CNN-based methods, while producing fewer false positive detections.", + "title": "Deformable Capsules for Object Detection", + "authors": [ + "Rodney LaLonde", + "Naji Khosravan", + "Ulas Bagci" + ], + "emails": [ + "~Rodney_LaLonde1", + "~Naji_Khosravan1", + "~Ulas_Bagci1" + ], + "rank": 1419 + }, + { + "url": "https://openreview.net/forum?id=jpm1AfJucwt", + "decision": "Reject", + "ratings": [ + 6, + 3, + 5, + 7 + ], + "rating": "5.39", + "confidences": [ + 5, + 4, + 4, + 5 + ], + "abstract": "By removing parameters from deep neural networks, unstructured pruning methods aim at cutting down memory footprint and computational cost, while maintaining prediction accuracy. In order to tackle this otherwise intractable problem, many of these methods model the loss landscape using first or second order Taylor expansions to identify which parameters can be discarded. We revisit loss modelling for unstructured pruning: we show the importance of ensuring locality of the pruning steps, and systematically compare first and second order Taylor expansions. 
Finally, we show that better preserving the original network function does not necessarily transfer to better performing networks after fine-tuning, suggesting that only considering the impact of pruning on the loss might not be a sufficient objective to design good pruning criteria.", + "title": "Revisiting Loss Modelling for Unstructured Pruning", + "authors": [ + "C\u00e9sar Laurent", + "Camille Ballas", + "Thomas George", + "Pascal Vincent", + "Nicolas Ballas" + ], + "emails": [ + "~Nicolas_Ballas1", + "~C\u00e9sar_Laurent1", + "~Pascal_Vincent1", + "~Thomas_George2", + "~Camille_Ballas1" + ], + "rank": 1420 + }, + { + "url": "https://openreview.net/forum?id=xJFxgRLx79J", + "decision": "Reject", + "ratings": [ + 6, + 7, + 6, + 3 + ], + "rating": "5.39", + "confidences": [ + 5, + 4, + 4, + 5 + ], + "abstract": "We propose a surprisingly simple but effective two-time-scale (2TS) model for learning user representations for recommendation. In our approach, we will partition users into two sets, active users with many observed interactions and inactive or new users with few observed interactions, and we will use two RNNs to model them separately. Furthermore, we design a two-stage training method for our model, where, in the first stage, we learn transductive embeddings for users and items, and then, in the second stage, we learn the two RNNs leveraging the transductive embeddings trained in the first stage. Through the lens of online learning and stochastic optimization, we provide theoretical analysis that motivates the design of our 2TS model. The 2TS model achieves a nice bias-variance trade-off while being computationally efficient. In large scale datasets, our 2TS model is able to achieve significantly better recommendations than previous state-of-the-art, yet being much more computationally efficient. 
", + "title": "Learning Two-Time-Scale Representations For Large Scale Recommendations", + "authors": [ + "Xinshi Chen", + "Yan Zhu", + "Haowen Xu", + "Muhan Zhang", + "Liang Xiong", + "Le Song" + ], + "emails": [ + "~Haowen_Xu1", + "~Liang_Xiong1", + "~Xinshi_Chen1", + "~Le_Song1", + "fb.com" + ], + "rank": 1421 + }, + { + "url": "https://openreview.net/forum?id=pGIHq1m7PU", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 6, + 1, + 6 + ], + "rating": "5.39", + "confidences": [ + 4, + 4, + 4, + 3, + 3 + ], + "abstract": "Modeling time-evolving knowledge graphs (KGs) has recently gained increasing interest. Here, graph representation learning has become the dominant paradigm for link prediction on temporal KGs. However, the embedding-based approaches largely operate in a black-box fashion, lacking the ability to interpret their predictions. This paper provides a link forecasting framework that reasons over query-relevant subgraphs of temporal KGs and jointly models the structural dependencies and the temporal dynamics. Especially, we propose a temporal relational attention mechanism and a novel reverse representation update scheme to guide the extraction of an enclosing subgraph around the query. The subgraph is expanded by an iterative sampling of temporal neighbors and by attention propagation. Our approach provides human-understandable evidence explaining the forecast. We evaluate our model on four benchmark temporal knowledge graphs for the link forecasting task. While being more explainable, our model obtains a relative improvement of up to 20 % on Hits@1 compared to the previous best temporal KG forecasting method. We also conduct a survey with 53 respondents, and the results show that the evidence extracted by the model for link forecasting is aligned with human understanding. 
", + "title": "Explainable Subgraph Reasoning for Forecasting on Temporal Knowledge Graphs", + "authors": [ + "Zhen Han", + "Peng Chen", + "Yunpu Ma", + "Volker Tresp" + ], + "emails": [ + "~Yunpu_Ma1", + "~Zhen_Han3", + "~Volker_Tresp1", + "tum.de" + ], + "rank": 1422 + }, + { + "url": "https://openreview.net/forum?id=MCe-j2-mVnA", + "decision": "Reject", + "ratings": [ + 5, + 4, + 7 + ], + "rating": "5.38", + "confidences": [ + 2, + 3, + 3 + ], + "abstract": "In this work we focus on general-purpose learned optimizers capable of training a wide variety of problems with no user-specified hyperparameters. We introduce a new, neural network parameterized, hierarchical optimizer with access to additional features such as validation loss to enable automatic regularization. Most learned optimizers have been trained on only a single task, or a small number of tasks. We train our optimizers on thousands of tasks, making use of orders of magnitude more compute, resulting in optimizers that generalize better to unseen tasks. The learned optimizers not only perform well, but learn behaviors that are distinct from existing first order optimizers. For instance, they generate update steps that have implicit regularization and adapt as the problem hyperparameters (e.g. batch size) or architecture (e.g. neural network width) change. Finally, these learned optimizers show evidence of being useful for out of distribution tasks such as training themselves from scratch.", + "title": "Overcoming barriers to the training of effective learned optimizers", + "authors": [ + "Luke Metz", + "Niru Maheswaranathan", + "C. 
Daniel Freeman", + "Ben Poole", + "Jascha Sohl-Dickstein" + ], + "emails": [ + "~C._Daniel_Freeman1", + "~Jascha_Sohl-Dickstein2", + "~Niru_Maheswaranathan1", + "~Luke_Metz1", + "~Ben_Poole1" + ], + "rank": 1423 + }, + { + "url": "https://openreview.net/forum?id=JydXRRDoDTv", + "decision": "Reject", + "ratings": [ + 4, + 5, + 6, + 7 + ], + "rating": "5.38", + "confidences": [ + 4, + 4, + 1, + 4 + ], + "abstract": "Although policy optimization with neural networks has a track record of achieving state-of-the-art results in reinforcement learning on various domains, the theoretical understanding of the computational and sample efficiency of policy optimization remains restricted to linear function approximations with finite-dimensional feature representations, which hinders the design of principled, effective, and efficient algorithms. To this end, we propose an optimistic model-based policy optimization algorithm, which allows general function approximations while incorporating~exploration. In the episodic setting, we establish a T-regret that scales polynomially in the eluder dimension of the general model class. Here T is the number of steps taken by the agent. 
In particular, we specialize such a regret to handle two nonparametric model classes; one based on reproducing kernel Hilbert spaces and another based on overparameterized neural networks.", + "title": "Optimistic Policy Optimization with General Function Approximations", + "authors": [ + "Qi Cai", + "Zhuoran Yang", + "Csaba Szepesvari", + "Zhaoran Wang" + ], + "emails": [ + "~Csaba_Szepesvari1", + "~Zhaoran_Wang1", + "~Zhuoran_Yang1", + "~Qi_Cai2" + ], + "rank": 1424 + }, + { + "url": "https://openreview.net/forum?id=trYkgJMOXhy", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 6 + ], + "rating": "5.38", + "confidences": [ + 3, + 4, + 4, + 2 + ], + "abstract": "Increasing evidences has shown that data biases towards sensitive features such as gender or race are often inherited or even amplified by machine learning models. Recent advancements in fairness mitigate such biases by adjusting the predictions across sensitive groups during the training. Such a correction, however, can only take advantage of samples in a fixed dataset, which usually has limited amount of samples for the minority groups. We propose a generative fairness teaching framework that provides a model with not only real samples but also synthesized samples to compensate the data biases during training. We employ such a teaching strategy by implementing a Generative Fairness Teacher (GFT) that dynamically adjust the proportion of training data for a biased student model. 
Experimental results indicated that our teacher model is capable of guiding a wide range of biased models by improving the fairness and performance trade-offs significantly.", + "title": "Generative Fairness Teaching", + "authors": [ + "Rongmei Lin", + "Hanjun Dai", + "Li Xiong", + "Wei Wei" + ], + "emails": [ + "~Hanjun_Dai1", + "~Wei_Wei15", + "~Li_Xiong1", + "~Rongmei_Lin1" + ], + "rank": 1425 + }, + { + "url": "https://openreview.net/forum?id=jH7wTMOYvbw", + "decision": "Reject", + "ratings": [ + 4, + 6, + 6, + 6 + ], + "rating": "5.38", + "confidences": [ + 4, + 4, + 4, + 1 + ], + "abstract": "While massage passing based Graph Neural Networks (GNNs) have become increasingly popular architectures for learning with graphs, recent works have revealed important shortcomings in their expressive power. In response, several higher-order GNNs have been proposed, which substantially increase the expressive power, but at a large computational cost.\n Motivated by this gap, we introduce and analyze a new recursive pooling technique of local neighborhoods that allows different tradeoffs of computational cost and expressive power. First, we show that this model can count subgraphs of size k, and thereby overcomes a known limitation of low-order GNNs. Second, we prove that, in several cases, RNP-GNNs can greatly reduce computational complexity compared to the existing higher-order k-GNN and Local Relational Pooling (LRP) networks. \n ", + "title": "Recursive Neighborhood Pooling for Graph Representation Learning", + "authors": [ + "Behrooz Tahmasebi", + "Stefanie Jegelka" + ], + "emails": [ + "~Behrooz_Tahmasebi1", + "~Stefanie_Jegelka3" + ], + "rank": 1426 + }, + { + "url": "https://openreview.net/forum?id=enhd0P_ERBO", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5 + ], + "rating": "5.38", + "confidences": [ + 3, + 5, + 5 + ], + "abstract": "Reinforcement learning has been used to learn to solve various routing problems. 
however, most of the algorithm is restricted to finding an optimal routing strategy for only a single vehicle. In addition, the trained policy under a specific target routing problem is not able to solve different types of routing problems with different objectives and constraints. This paper proposes an reinforcement learning approach to solve the min-max capacitated multi vehicle routing problem (mCVRP), the problem seeks to minimize the total completion time for multiple vehicles whose one-time traveling distance is constrained by their fuel levels to serve the geographically distributed customer nodes. The method represents the relationships among vehicles, customers, and fuel stations using relationship-specific graphs to consider their topological relationships and employ graph neural network (GNN) to extract the graph's embedding to be used to make a routing action. We train the proposed model using the random mCVRP instance with different numbers of vehicles, customers, and refueling stations. We then validate that the trained policy solve not only new mCVRP problems with different complexity (weak transferability but also different routing problems (CVRP, mTSP, TSP) with different objectives and constraints (storing transferability). ", + "title": "Learning a Transferable Scheduling Policy for Various Vehicle Routing Problems based on Graph-centric Representation Learning", + "authors": [ + "Inwook Kim", + "Jinkyoo Park" + ], + "emails": [ + "gmail.com", + "~Jinkyoo_Park1" + ], + "rank": 1427 + }, + { + "url": "https://openreview.net/forum?id=ysti0DEWTSo", + "decision": "Reject", + "ratings": [ + 4, + 4, + 6, + 7 + ], + "rating": "5.38", + "confidences": [ + 4, + 2, + 3, + 4 + ], + "abstract": "It has been recognized that a heavily overparameterized artificial neural network exhibits surprisingly good generalization performance in various machine-learning tasks. 
Recent theoretical studies have made attempts to unveil the mystery of the overparameterization. In most of those previous works, the overparameterization is achieved by increasing the width of the network, while the effect of increasing the depth has been less well understood. In this work, we investigate the effect of increasing the depth within an overparameterized regime. To gain an insight into the advantage of depth, we introduce local and global labels as abstract but simple classification rules. It turns out that the locality of the relevant feature for a given classification rule plays an important role; our experimental results suggest that deeper is better for local labels, whereas shallower is better for global labels. We also compare the results of finite networks with those of the neural tangent kernel (NTK), which is equivalent to an infinitely wide network with a proper initialization and an infinitesimal learning rate. It is shown that the NTK does not correctly capture the depth dependence of the generalization performance, which indicates the importance of the feature learning, rather than the lazy learning.", + "title": "Is deeper better? It depends on locality of relevant features", + "authors": [ + "Takashi Mori", + "Masahito Ueda" + ], + "emails": [ + "~Takashi_Mori1", + "phys.s.u-tokyo.ac.jp" + ], + "rank": 1428 + }, + { + "url": "https://openreview.net/forum?id=aCgLmfhIy_f", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 6, + 7, + 5 + ], + "rating": "5.38", + "confidences": [ + 4, + 4, + 3, + 5 + ], + "abstract": "Recognizing relations between entities is a pivotal task of relational learning. \nLearning relation representations from distantly-labeled datasets is difficult because of the abundant label noise and complicated expressions in human language. 
\nThis paper aims to learn predictive, interpretable, and robust relation representations from distantly-labeled data that are effective in different settings, including supervised, distantly supervised, and few-shot learning. \nInstead of solely relying on the supervision from noisy labels, we propose to learn prototypes for each relation from contextual information to best explore the intrinsic semantics of relations. \nPrototypes are representations in the feature space abstracting the essential semantics of relations between entities in sentences.\nWe learn prototypes based on objectives with clear geometric interpretation, where the prototypes are unit vectors uniformly dispersed in a unit ball, and statement embeddings are centered at the end of their corresponding prototype vectors on the surface of the ball. \nThis approach allows us to learn meaningful, interpretable prototypes for the final classification.\nResults on several relation learning tasks show that our model significantly outperforms the previous state-of-the-art models.\nWe further demonstrate the robustness of the encoder and the interpretability of prototypes with extensive experiments.", + "title": "Prototypical Representation Learning for Relation Extraction", + "authors": [ + "Ning Ding", + "Xiaobin Wang", + "Yao Fu", + "Guangwei Xu", + "Rui Wang", + "Pengjun Xie", + "Ying Shen", + "Fei Huang", + "Hai-Tao Zheng", + "Rui Zhang" + ], + "emails": [ + "~Guangwei_Xu2", + "~Yao_Fu3", + "~Ying_Shen3", + "~Xiaobin_Wang1", + "~Ning_Ding5", + "~Rui_Zhang11", + "~Pengjun_Xie2", + "~Rui_Wang16", + "~Hai-Tao_Zheng2", + "~Fei_Huang2" + ], + "rank": 1429 + }, + { + "url": "https://openreview.net/forum?id=fB2GZQajQ2b", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6 + ], + "rating": "5.38", + "confidences": [ + 3, + 2, + 3 + ], + "abstract": "In this paper, we propose a unified information-theoretic framework for odometry learning, a crucial component of many robotics and vision tasks such as 
navigation and virtual reality where 6-DOF poses are required in real time. We formulate this problem as optimizing a variational information bottleneck objective function, which eliminates pose-irrelevant information from the latent representation. The proposed framework provides an elegant tool for performance evaluation and understanding in information-theoretic language. Specifically, we bound the generalization errors of the deep information bottleneck framework and the predictability of the latent representation. These provide not only a performance guarantee but also practical guidance for model design, sample collection, and sensor selection. Furthermore, the stochastic latent representation provides a natural uncertainty measure without the needs for extra structures or computations. Experiments on two well-known odometry datasets demonstrate the effectiveness of our method.", + "title": "Information-Theoretic Odometry Learning", + "authors": [ + "Sen Zhang", + "Jing Zhang", + "Dacheng Tao" + ], + "emails": [ + "~Sen_Zhang3", + "~Jing_Zhang17", + "~Dacheng_Tao1" + ], + "rank": 1430 + }, + { + "url": "https://openreview.net/forum?id=cU0a02VF8ZG", + "decision": "Reject", + "ratings": [ + 7, + 5, + 5, + 5 + ], + "rating": "5.38", + "confidences": [ + 3, + 5, + 4, + 4 + ], + "abstract": "Machine translation in a multi-language scenario requires large-scale parallel corpora for every language pair. Unsupervised translation is challenging because there is no explicit connection between languages, and the existing methods have to rely on topological properties of the language representations. We introduce a framework that leverages visual similarity to align multiple languages, using images as the bridge between them. We estimate the cross-modal alignment between language and images, and use this estimate to guide the learning of cross-lingual representations. Our language representations are trained jointly in one model with a single stage. 
Experiments with fifty-two languages show that our method outperforms prior work on unsupervised word-level and sentence-level translation using retrieval.", + "title": "Globetrotter: Unsupervised Multilingual Translation from Visual Alignment", + "authors": [ + "Didac Suris Coll-Vinent", + "Dave Epstein", + "Carl Vondrick" + ], + "emails": [ + "~Dave_Epstein1", + "~Carl_Vondrick2", + "~Didac_Suris_Coll-Vinent1" + ], + "rank": 1431 + }, + { + "url": "https://openreview.net/forum?id=Tio_oO2ga3u", + "decision": "Reject", + "ratings": [ + 3, + 5, + 8, + 6 + ], + "rating": "5.38", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "Gaussian processes (GPs) are nonparametric Bayesian models that are both flexible and robust to overfitting. One of the main challenges of GP methods is selecting the kernel. In the deep kernel learning (DKL) paradigm, a deep neural network or ``feature network'' is used to map inputs into a latent feature space, where a GP with a ``base kernel'' acts; the resulting model is then trained in an end-to-end fashion. In this work, we introduce the ``deep ensemble kernel learning'' (DEKL) model, which is a special case of DKL. In DEKL, a linear base kernel is used, enabling exact optimization of the base kernel hyperparameters and a scalable inference method that does not require approximation by inducing points. We also represent the feature network as a concatenation of an ensemble of learner networks with a common architecture, allowing for easy model parallelism. We show that DEKL is able to approximate any kernel if the number of learners in the ensemble is arbitrarily large. 
Comparing the DEKL model to DKL and deep ensemble (DE) baselines on both synthetic and real-world regression tasks, we find that DEKL often outperforms both baselines in terms of predictive performance and that the DEKL learners tend to be more diverse (i.e., less correlated with one another) compared to the DE learners.", + "title": "Deep Ensemble Kernel Learning", + "authors": [ + "Devanshu Agrawal", + "Jacob D Hinkle" + ], + "emails": [ + "~Jacob_D_Hinkle1", + "vols.utk.edu" + ], + "rank": 1432 + }, + { + "url": "https://openreview.net/forum?id=j0p8ASp9Br", + "decision": "Reject", + "ratings": [ + 5, + 6, + 7, + 3 + ], + "rating": "5.38", + "confidences": [ + 3, + 3, + 4, + 3 + ], + "abstract": "Safety-critical decisions based on machine learning models require a clear understanding of the involved uncertainties to avoid hazardous or risky situations. While aleatoric uncertainty can be explicitly modeled given a parametric description, epistemic uncertainty rather describes the presents or absence of training data. This paper proposes a novel generic method for modeling \nepistemic uncertainty and shows its advantages over existing approaches for neural networks on various data sets. It can be directly combined with aleatoric uncertainty estimates and \nallows for prediction in real-time as the inference is sample-free. 
We exploit this property in a model-based quadcopter control setting and demonstrate how the controller benefits from a differentiation between aleatoric and epistemic uncertainty in online learning of thermal disturbances.", + "title": "Real-time Uncertainty Decomposition for Online Learning Control", + "authors": [ + "Jonas Umlauft", + "Armin Lederer", + "Thomas Beckers", + "Sandra Hirche" + ], + "emails": [ + "~Jonas_Umlauft1", + "~Sandra_Hirche1", + "~Thomas_Beckers1", + "tum.de" + ], + "rank": 1433 + }, + { + "url": "https://openreview.net/forum?id=OBI5QuStBz3", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6 + ], + "rating": "5.38", + "confidences": [ + 2, + 3, + 3 + ], + "abstract": "Motivated by the interest in communication-efficient methods for distributed machine learning, we consider the communication complexity of minimising a sum of d-dimensional functions \u2211i=1Nfi(x), where each function fi is held by one of the N different machines. Such tasks arise naturally in large-scale optimisation, where a standard solution is to apply variants of (stochastic) gradient descent. As our main result, we show that \u03a9(Ndlog\u2061d/\u03b5) bits in total need to be communicated between the machines to find an additive \u03f5-approximation to the minimum of \u2211i=1Nfi(x). The results holds for deterministic algorithms, and randomised algorithms under some restrictions on the parameter values. Importantly, our lower bounds require no assumptions on the structure of the algorithm, and are matched within constant factors for strongly convex objectives by a new variant of quantised gradient descent. The lower bounds are obtained by bringing over tools from communication complexity to distributed optimisation, an approach we hope will find further use in future.\n", + "title": "Improved Communication Lower Bounds for Distributed Optimisation", + "authors": [ + "Janne H. 
Korhonen", + "Dan Alistarh" + ], + "emails": [ + "~Dan_Alistarh7", + "~Janne_H._Korhonen2" + ], + "rank": 1434 + }, + { + "url": "https://openreview.net/forum?id=kic8cng35wX", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 4 + ], + "rating": "5.38", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Neural Architecture Search (NAS) finds the best network architecture by exploring the architecture-to-performance manifold. It often trains and evaluates a large amount of architectures, causing tremendous computation cost. Recent predictor-based NAS approaches attempt to solve this problem with two key steps: sampling some architecture-performance pairs and fitting a proxy accuracy predictor. Existing predictors attempt to model the performance distribution over the whole architecture space, which could be too challenging given limited samples. Instead, we envision that this ambitious goal may not be necessary if the final aim is to find the best architecture. We present a novel framework to estimate weak predictors progressively. Rather than expecting a single strong predictor to model the whole space, we seek a progressive line of weak predictors that can connect a path to the best architecture, thus greatly simplifying the learning task of each predictor. It is based on the key property of the predictors that their probabilities of sampling better architectures will keep increasing. We thus only sample a few well-performed architectures guided by the predictive model, to estimate another better weak predictor. By this coarse-to-fine iteration, the ranking of sampling space is refined gradually, which helps find the optimal architectures eventually. 
Experiments demonstrate that our method costs fewer samples to find the top-performance architectures on NAS-Bench-101 and NAS-Bench-201, and it achieves the state-of-the-art ImageNet performance on the NASNet search space.", + "title": "Weak NAS Predictor Is All You Need", + "authors": [ + "Junru Wu", + "Xiyang Dai", + "Dongdong Chen", + "Yinpeng Chen", + "Mengchen Liu", + "Ye Yu", + "Zhangyang Wang", + "Zicheng Liu", + "Mei Chen", + "Lu Yuan" + ], + "emails": [ + "~Mei_Chen2", + "~Zhangyang_Wang1", + "~Lu_Yuan1", + "~Junru_Wu2", + "~Yinpeng_Chen1", + "~Dongdong_Chen1", + "~Mengchen_Liu2", + "~Xiyang_Dai2", + "~Zicheng_Liu1", + "~Ye_Yu2" + ], + "rank": 1435 + }, + { + "url": "https://openreview.net/forum?id=o1O5nc48rn", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5, + 7 + ], + "rating": "5.38", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "Current graph neural network (GNN) architectures naively average or sum node embeddings into an aggregated graph representation---potentially losing structural or semantic information. We here introduce OT-GNN, a model that computes graph embeddings using parametric prototypes that highlight key facets of different graph aspects. Towards this goal, we are (to our knowledge) the first to successfully combine optimal transport with parametric graph models. Graph representations are obtained from Wasserstein distances between the set of GNN node embeddings and \"prototype\" point clouds as free parameters. We theoretically prove that, unlike traditional sum aggregation, our function class on point clouds satisfies a fundamental universal approximation theorem. Empirically, we address an inherent collapse optimization issue by proposing a noise contrastive regularizer to steer the model towards truly exploiting the optimal transport geometry. 
Finally, we consistently report better generalization performance on several molecular property prediction tasks, while exhibiting smoother graph representations.", + "title": "Optimal Transport Graph Neural Networks", + "authors": [ + "Gary B\u00e9cigneul", + "Octavian-Eugen Ganea", + "Benson Chen", + "Regina Barzilay", + "Tommi S. Jaakkola" + ], + "emails": [ + "~Octavian-Eugen_Ganea1", + "~Gary_B\u00e9cigneul1", + "~Regina_Barzilay1", + "~Benson_Chen1", + "~Tommi_S._Jaakkola1" + ], + "rank": 1436 + }, + { + "url": "https://openreview.net/forum?id=3F0Qm7TzNDM", + "decision": "Reject", + "ratings": [ + 6, + 6, + 3, + 7 + ], + "rating": "5.38", + "confidences": [ + 2, + 3, + 4, + 4 + ], + "abstract": "In the context of supervised learning of a function by a Neural Network (NN), we claim and empirically justify that a NN yields better results when the distribution of the data set focuses on regions where the function to learn is steeper. We first traduce this assumption in a mathematically workable way using Taylor expansion. Then, theoretical derivations allow to construct a methodology that we call Variance Based Samples Weighting (VBSW). VBSW uses local variance of the labels to weight the training points. This methodology is general, scalable, cost effective, and significantly increases the performances of a large class of models for various classification and regression tasks on image, text and multivariate data. 
We highlight its benefits with experiments involving NNs from shallow linear NN to ResNet or Bert.", + "title": "Variance Based Sample Weighting for Supervised Deep Learning", + "authors": [ + "Paul Novello", + "Ga\u00ebl Po\u00ebtte", + "David Lugato", + "Pietro Congedo" + ], + "emails": [ + "~Paul_Novello1", + "cea.fr", + "inria.fr" + ], + "rank": 1437 + }, + { + "url": "https://openreview.net/forum?id=cbdp6RLk2r7", + "decision": "Reject", + "ratings": [ + 6, + 6, + 3, + 7, + 5 + ], + "rating": "5.37", + "confidences": [ + 4, + 3, + 4, + 4, + 4 + ], + "abstract": "A core challenge in Machine Learning is to disentangle natural factors of variation in data (e.g. object shape vs pose). A popular approach to disentanglement consists in learning to map each of these factors to distinct subspaces of a model's latent representation. However, this approach has shown limited empirical success to date. Here, we show that this approach to disentanglement introduces topological defects (i.e. discontinuities in the encoder) for a broad family of transformations acting on images ---encompassing simple affine transformations such as rotations and translations. Moreover, motivated by classical results from group representation theory, we propose an alternative, more flexible approach to disentanglement which relies on distributed equivariant operators, potentially acting on the entire latent space. We theoretically and empirically demonstrate the effectiveness of our approach to disentangle affine transformations. 
Our work lays a theoretical foundation for the recent success of a new generation of models using distributed operators for disentanglement (see Discussion).", + "title": "Addressing the Topological Defects of Disentanglement", + "authors": [ + "Diane Bouchacourt", + "Mark Ibrahim", + "Stephane Deny" + ], + "emails": [ + "~Stephane_Deny1", + "fb.com", + "~Diane_Bouchacourt3" + ], + "rank": 1438 + }, + { + "url": "https://openreview.net/forum?id=JbAqsfbYsJy", + "decision": "Reject", + "ratings": [ + 6, + 6, + 3, + 7 + ], + "rating": "5.36", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "We introduce a unified objective for action and perception of intelligent agents. Extending representation learning and control, we minimize the joint divergence between the combined system of agent and environment and a target distribution. Intuitively, such agents use perception to align their beliefs with the world, and use actions to align the world with their beliefs. Minimizing the joint divergence to an expressive target maximizes the mutual information between the agent's representations and inputs, thus inferring representations that are informative of past inputs and exploring future inputs that are informative of the representations. This lets us explain intrinsic objectives, such as representation learning, information gain, empowerment, and skill discovery from minimal assumptions. Moreover, interpreting the target distribution as a latent variable model suggests powerful world models as a path toward highly adaptive agents that seek large niches in their environments, rendering task rewards optional. The framework provides a common language for comparing a wide range of objectives, advances the understanding of latent variables for decision making, and offers a recipe for designing novel objectives. 
We recommend deriving future agent objectives the joint divergence to facilitate comparison, to point out the agent's target distribution, and to identify the intrinsic objective terms needed to reach that distribution.", + "title": "Action and Perception as Divergence Minimization", + "authors": [ + "Danijar Hafner", + "Pedro A Ortega", + "Jimmy Ba", + "Thomas Parr", + "Karl Friston", + "Nicolas Heess" + ], + "emails": [ + "ucl.ac.uk", + "~Pedro_A_Ortega1", + "~Nicolas_Heess1", + "~Danijar_Hafner1", + "~Jimmy_Ba1", + "~Karl_Friston1" + ], + "rank": 1439 + }, + { + "url": "https://openreview.net/forum?id=C5kn825mU19", + "decision": "Reject", + "ratings": [ + 5, + 4, + 6, + 7 + ], + "rating": "5.36", + "confidences": [ + 3, + 3, + 3, + 2 + ], + "abstract": "In real-world multi-agent teams, agents with different capabilities may join or leave \"on the fly\" without altering the team's overarching goals. Coordinating teams with such dynamic composition remains a challenging problem: the optimal team strategy may vary with its composition. Inspired by real-world team sports, we propose a coach-player framework to tackle this problem. We assume that the players only have a partial view of the environment, while the coach has a complete view. The coach coordinates the players by distributing individual strategies. Specifically, we 1) propose an attention mechanism for both the players and the coach; 2) incorporate a variational objective to regularize learning; and 3) design an adaptive communication method to let the coach decide when to communicate with different players. Our attention mechanism on the players and the coach allows for a varying number of heterogeneous agents, and can thus tackle the dynamic team composition. We validate our methods on resource collection tasks in multi-agent particle environment. We demonstrate zero-shot generalization to new team compositions with varying numbers of heterogeneous agents. 
The performance of our method is comparable or even better than the setting where all players have a full view of the environment, but no coach. Moreover, we see that the performance stays nearly the same even when the coach communicates as little as 13% of the time using our adaptive communication strategy. These results demonstrate the significance of a coach to coordinate players in dynamic teams.", + "title": "A Coach-Player Framework for Dynamic Team Composition", + "authors": [ + "Bo Liu", + "qiang liu", + "Peter Stone", + "Animesh Garg", + "Yuke Zhu", + "Anima Anandkumar" + ], + "emails": [ + "~Yuke_Zhu1", + "~qiang_liu4", + "~Bo_Liu13", + "~Anima_Anandkumar1", + "~Peter_Stone1", + "~Animesh_Garg1" + ], + "rank": 1440 + }, + { + "url": "https://openreview.net/forum?id=meG3o0ttiAD", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6 + ], + "rating": "5.36", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "Quantum Neural Networks (QNNs) have been recently proposed as generalizations of classical neural networks to achieve the quantum speed-up. Despite the potential to outperform classical models, serious bottlenecks exist for training QNNs; namely, QNNs with random structures have poor trainability due to the vanishing gradient with rate exponential to the input qubit number. The vanishing gradient could seriously influence the applications of large-size QNNs. In this work, we provide the first viable solution with theoretical guarantees. Specifically, we prove that QNNs with tree tensor and step controlled architectures have gradients that vanish at most polynomially with the qubit number. Moreover, our result holds irrespective of which encoding methods are employed. We numerically demonstrate QNNs with tree tensor and step controlled structures for the application of binary classification. 
Simulations show faster convergent rates and better accuracy compared to QNNs with random structures.", + "title": "Toward Trainability of Quantum Neural Networks", + "authors": [ + "Kaining Zhang", + "Min-Hsiu Hsieh", + "Liu Liu", + "Dacheng Tao" + ], + "emails": [ + "~Kaining_Zhang1", + "uts.edu.au", + "~Liu_Liu8", + "~Dacheng_Tao1" + ], + "rank": 1441 + }, + { + "url": "https://openreview.net/forum?id=OLOr1K5zbDu", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 6 + ], + "rating": "5.36", + "confidences": [ + 3, + 4, + 5, + 2 + ], + "abstract": "The record-breaking performance and prohibitive complexity of deep neural networks (DNNs) have ignited a substantial need for customized DNN accelerators which have the potential to boost DNN acceleration efficiency by orders-of-magnitude. While it has been recognized that maximizing DNNs' acceleration efficiency requires a joint design/search for three different yet highly coupled aspects, including the networks, adopted precision, and their accelerators, the challenges associated with such a joint search have not yet been fully discussed and addressed. First, to jointly search for a network and its precision via differentiable search, there exists a dilemma of whether to explode the memory consumption or achieve sub-optimal designs. Second, a generic and differentiable joint search of the networks and their accelerators is non-trivial due to (1) the discrete nature of the accelerator space and (2) the difficulty of obtaining operation-wise hardware cost penalties because some accelerator parameters are determined by the whole network. To this end, we propose a Triple-Search (TRIPS) framework to address the aforementioned challenges towards jointly searching for the network structure, precision, and accelerator in a differentiable manner, to efficiently and effectively explore the huge joint search space. 
Our TRIPS addresses the first challenge above via a heterogeneous sampling strategy to achieve unbiased search with constant memory consumption, and tackles the latter one using a novel co-search pipeline that integrates a generic differentiable accelerator search engine. Extensive experiments and ablation studies validate that both TRIPS generated networks and accelerators consistently outperform state-of-the-art (SOTA) designs (including co-search/exploration techniques, hardware-aware NAS methods, and DNN accelerators), in terms of search time, task accuracy, and accelerator efficiency. All codes will be released upon acceptance.", + "title": "Triple-Search: Differentiable Joint-Search of Networks, Precision, and Accelerators", + "authors": [ + "Yonggan Fu", + "Yongan Zhang", + "Haoran You", + "Yingyan Lin" + ], + "emails": [ + "~Yingyan_Lin1", + "~Yonggan_Fu1", + "~Yongan_Zhang1", + "~Haoran_You1" + ], + "rank": 1442 + }, + { + "url": "https://openreview.net/forum?id=tADlrawCrVU", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5 + ], + "rating": "5.36", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "We address the problem of self-supervised learning on discrete event sequences generated by real-world users. Self-supervised learning incorporates complex information from the raw data in low-dimensional fixed-length vector representations that could be easily applied in various downstream machine learning tasks. In this paper, we propose a new method CoLES, which adopts contrastive learning, previously used for audio and computer vision domains, to the discrete event sequences domain in a self-supervised setting. Unlike most previous studies, we theoretically justify under mild conditions that the augmentation method underlying CoLES provides representative samples of discrete event sequences. 
We evaluated CoLES on several public datasets and showed that CoLES representations consistently outperform other methods on different downstream tasks.\n", + "title": "CoLES: Contrastive learning for event sequences with self-supervision", + "authors": [ + "Dmitrii Babaev", + "Nikita Ovsov", + "Ivan A Kireev", + "Gleb Gusev", + "Maria Ivanova", + "Alexander Tuzhilin" + ], + "emails": [ + "~Nikita_Ovsov1", + "~Ivan_A_Kireev1", + "~Dmitrii_Babaev2", + "~Gleb_Gusev1", + "~Alexander_Tuzhilin1", + "~Maria_Ivanova2" + ], + "rank": 1443 + }, + { + "url": "https://openreview.net/forum?id=MxaY4FzOTa", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 6, + 4 + ], + "rating": "5.36", + "confidences": [ + 3, + 2, + 4, + 5 + ], + "abstract": "Network binarization is a promising hardware-aware direction for creating efficient deep models. Despite its memory and computational advantages, reducing the accuracy gap between binary models and their real-valued counterparts remains an unsolved challenging research problem. To this end, we make the following 3 contributions: (a) To increase model capacity, we propose Expert Binary Convolution, which, for the first time, tailors conditional computing to binary networks by learning to select one data-specific expert binary filter at a time conditioned on input features. (b) To increase representation capacity, we propose to address the inherent information bottleneck in binary networks by introducing an efficient width expansion mechanism which keeps the binary operations within the same budget. (c) To improve network design, we propose a principled binary network growth mechanism that unveils a set of network topologies of favorable properties. Overall, our method improves upon prior work, with no increase in computational cost, by \u223c6%, reaching a groundbreaking \u223c71% on ImageNet classification. 
Code will be made available $\\href{https://www.adrianbulat.com/binary-networks}{here}$.", + "title": "High-Capacity Expert Binary Networks", + "authors": [ + "Adrian Bulat", + "Brais Martinez", + "Georgios Tzimiropoulos" + ], + "emails": [ + "~Georgios_Tzimiropoulos1", + "~Adrian_Bulat1", + "~Brais_Martinez3" + ], + "rank": 1444 + }, + { + "url": "https://openreview.net/forum?id=N6SmiyDrkR5", + "decision": "Reject", + "ratings": [ + 5, + 6, + 3, + 8 + ], + "rating": "5.36", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "We propose a novel method for exploring how neurons within a neural network interact. In particular, we consider activation values of a network for given data, and propose to mine noise-robust rules of the form X\u2192Y , where X and Y are sets of neurons in different layers. To ensure we obtain a small and non-redundant set of high quality rules, we formalize the problem in terms of the Minimum Description Length principle, by which we identify the best set of rules as the one that best compresses the activation data. To discover good rule sets, we propose the unsupervised ExplaiNN algorithm. Extensive evaluation shows that our rules give clear insight in how networks perceive the world: they identify shared, resp. class-specific traits, compositionality within the network, as well as locality in convolutional layers. Our rules are easily interpretable, but also super-charge prototyping as they identify which groups of neurons to consider in unison.", + "title": "What's in the Box? 
Exploring the Inner Life of Neural Networks with Robust Rules", + "authors": [ + "Jonas Fischer", + "Anna Ol\u00e1h", + "Jilles Vreeken" + ], + "emails": [ + "~Jonas_Fischer1", + "mmci.uni-saarland.de", + "cispa.de" + ], + "rank": 1445 + }, + { + "url": "https://openreview.net/forum?id=5Dj8rVRg9Ui", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5 + ], + "rating": "5.36", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "Effective long-range forecasting of time series data remains an unsolved and open problem. One possible approach is to use generative models to improve long-range forecasting, but the challenge then is how to generate high-quality synthetic data. In this paper, we propose a conditional Wasserstein GAN with Gradient and Error Penalty (cWGAN-GEP), aiming to generate accurate synthetic data that preserves the temporal dynamics between the conditioning input and generated data. By using such synthetic data, we develop a long-range forecasting method called Generative Forecasting (GenF). GenF consists of three key components: (i) a cWGAN-GEP based generator, to generate synthetic data for next few time steps. (ii) a predictor which makes long-range predictions based on generated and observed data. (iii) an information theoretic clustering (ITC) algorithm to better train the cWGAN-GEP based generator and the predictor. Our experimental results on three public datasets demonstrate that GenF significantly outperforms a diverse range of state-of-the-art benchmarks and classical approaches. In most cases, we find an improvement of at least 10% over all studied methods. 
Lastly, we conduct an ablation study to demonstrate the effectiveness of the cWGAN-GEP and the ITC algorithm.", + "title": "Using Synthetic Data to Improve the Long-range Forecasting of Time Series Data", + "authors": [ + "Shiyu Liu", + "Mehul Motani" + ], + "emails": [ + "~Mehul_Motani1", + "~Shiyu_Liu1" + ], + "rank": 1446 + }, + { + "url": "https://openreview.net/forum?id=ARQAdp7F8OQ", + "decision": "Reject", + "ratings": [ + 5, + 4, + 7, + 6 + ], + "rating": "5.36", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Unsupervised learning of hidden representations has been one of the most vibrant research directions in machine learning in recent years. In this work we study the brain-like Bayesian Confidence Propagating Neural Network (BCPNN) model, recently extended to extract sparse distributed high-dimensional representations. The saliency and separability of the hidden representations when trained on MNIST dataset is studied using an external linear classifier and compared with other unsupervised learning methods that include restricted Boltzmann machines and autoencoders. ", + "title": "Brain-like approaches to unsupervised learning of hidden representations - a comparative study ", + "authors": [ + "Naresh Balaji", + "Anders Lansner", + "Pawel Herman" + ], + "emails": [ + "kth.se", + "~Naresh_Balaji1" + ], + "rank": 1447 + }, + { + "url": "https://openreview.net/forum?id=sMEpviTLi1h", + "decision": "Reject", + "ratings": [ + 7, + 6, + 5, + 3 + ], + "rating": "5.36", + "confidences": [ + 3, + 2, + 4, + 2 + ], + "abstract": "Bilevel optimization has arisen as a powerful tool for many machine learning problems such as meta-learning, hyperparameter optimization, and reinforcement learning. In this paper, we investigate the nonconvex-strongly-convex bilevel optimization problem. 
For deterministic bilevel optimization, we provide a comprehensive finite-time convergence analysis for two popular algorithms respectively based on approximate implicit differentiation (AID) and iterative differentiation (ITD). For the AID-based method, we orderwisely improve the previous finite-time convergence analysis due to a more practical parameter selection as well as a warm start strategy, and for the ITD-based method we establish the first theoretical convergence rate. Our analysis also provides a quantitative comparison between ITD and AID based approaches. For stochastic bilevel optimization, we propose a novel algorithm named stocBiO, which features a sample-efficient hypergradient estimator using efficient Jacobian- and Hessian-vector product computations. We provide the finite-time convergence guarantee for stocBiO, and show that stocBiO outperforms the best known computational complexities orderwisely with respect to the condition number \u03ba and the target accuracy \u03f5. We further validate our theoretical results and demonstrate the efficiency of bilevel optimization algorithms by the experiments on meta-learning and hyperparameter optimization.", + "title": "Provably Faster Algorithms for Bilevel Optimization and Applications to Meta-Learning", + "authors": [ + "Kaiyi Ji", + "Junjie Yang", + "Yingbin Liang" + ], + "emails": [ + "~Yingbin_Liang1", + "~Kaiyi_Ji1", + "~Junjie_Yang2" + ], + "rank": 1448 + }, + { + "url": "https://openreview.net/forum?id=iktA2PtTRsK", + "decision": "Reject", + "ratings": [ + 5, + 8, + 4 + ], + "rating": "5.36", + "confidences": [ + 3, + 3, + 5 + ], + "abstract": "Recent unsupervised representation learning techniques show remarkable success on many single image tasks by using instance discrimination: learning to differentiate between two augmented versions of the same image and a large batch of unrelated images. 
Prior work uses artificial data augmentation techniques such as cropping, and color jitter which can only affect the image in superficial waysand are not aligned with how objects actually change e.g. occlusion, deformation,viewpoint change. We argue that videos offer this natural augmentation for free.Videos can provide entirely new views of objects, show deformation, and even connect semantically similar but visually distinct concepts.We propose Video Noise Contrastive Estimation, a method for using unlabeled video to learnstrong, transferable, single image representations.We demonstrate improvements over recent unsupervised single image techniques,as well as over fully supervised ImageNet pretraining, across temporal and non-temporal tasks.", + "title": "Watching the World Go By: Representation Learning from Unlabeled Videos", + "authors": [ + "Daniel Gordon", + "Kiana Ehsani", + "Dieter Fox", + "Ali Farhadi" + ], + "emails": [ + "~Kiana_Ehsani1", + "~Daniel_Gordon1", + "~Dieter_Fox1", + "~Ali_Farhadi3" + ], + "rank": 1449 + }, + { + "url": "https://openreview.net/forum?id=eNSpdJeR_J", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4, + 6 + ], + "rating": "5.36", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Protecting data privacy in deep learning (DL) is at its urgency. Several celebrated privacy notions have been established and used for privacy-preserving DL. However, many of the existing mechanisms achieve data privacy at the cost of significant utility degradation. In this paper, we propose a stochastic differential equation principled \\emph{residual perturbation} for privacy-preserving DL, which injects Gaussian noise into each residual mapping of ResNets. Theoretically, we prove that residual perturbation guarantees differential privacy (DP) and reduces the generalization gap for DL. 
Empirically, we show that residual perturbation outperforms the state-of-the-art DP stochastic gradient descent (DPSGD) in both membership privacy protection and maintaining the DL models' utility. For instance, in the process of training ResNet8 for the IDC dataset classification, residual perturbation obtains an accuracy of 85.7\\% and protects the perfect membership privacy; in contrast, DPSGD achieves an accuracy of 82.8\\% and protects worse membership privacy. ", + "title": "Deep Learning with Data Privacy via Residual Perturbation", + "authors": [ + "Wenqi Tao", + "Huaming Ling", + "Zuoqiang Shi", + "Bao Wang" + ], + "emails": [ + "~Zuoqiang_Shi1", + "mails.tsinghua.edu.cn", + "~Bao_Wang1" + ], + "rank": 1450 + }, + { + "url": "https://openreview.net/forum?id=76M3pxkqRl", + "decision": "Reject", + "ratings": [ + 7, + 6, + 4, + 5 + ], + "rating": "5.36", + "confidences": [ + 2, + 5, + 4, + 3 + ], + "abstract": "Individual rationality, which involves maximizing expected individual return, does not always lead to optimal individual or group outcomes in multi-agent problems. For instance, in social dilemma situations, Reinforcement Learning (RL) agents trained to maximize individual rewards converge to mutual defection that is individually and socially sub-optimal. In contrast, humans evolve individual and socially optimal strategies in such social dilemmas. Inspired by ideas from human psychology that attribute this behavior in humans to the status-quo bias, we present a status-quo loss (SQLoss) and the corresponding policy gradient algorithm that incorporates this bias in an RL agent. We demonstrate that agents trained with SQLoss evolve individually as well as socially optimal behavior in several social dilemma matrix games. To apply SQLoss to games where cooperation and defection are determined by a sequence of non-trivial actions, we present GameDistill, an algorithm that reduces a multi-step game with visual input to a matrix game. 
We empirically show how agents trained with SQLoss on a GameDistill reduced version of the Coin Game evolve optimal policies. ", + "title": "Status-Quo Policy Gradient in Multi-agent Reinforcement Learning", + "authors": [ + "Pinkesh Badjatiya", + "Mausoom Sarkar", + "Abhishek Sinha", + "Nikaash Puri", + "Jayakumar Subramanian", + "Siddharth Singh", + "Balaji Krishnamurthy" + ], + "emails": [ + "~Pinkesh_Badjatiya1", + "~Jayakumar_Subramanian1", + "~Mausoom_Sarkar1", + "gmail.com", + "~Balaji_Krishnamurthy1", + "~Nikaash_Puri1", + "~Abhishek_Sinha1" + ], + "rank": 1451 + }, + { + "url": "https://openreview.net/forum?id=Xxli_LIvYI", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5 + ], + "rating": "5.36", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": " Approximation bounds for neural network pruning attempt to predict the trade-off between sparsity and fidelity while shrinking neural networks. In the first half of this paper, we empirically evaluate the predictive power of two recently proposed methods based on coreset algorithms. We identify several circumstances in which the bounds are loose or impractical to use and provide a brief analysis of the components of the bounds that contribute to those short-comings. In the second half, we examine the role of fine-tuning in prunability and observe that even tight approximation bounds would be poor predictors of accuracy after fine-tuning. This is because fine-tuning can recover large amounts of accuracy while simultaneously maintaining or increasing approximation error. We discuss the implications of these finding on the application of coreset-based pruning methods in practice and the role of approximation in the pruning community. 
Our code is available in the attached supplementary material.", + "title": "When Are Neural Pruning Approximation Bounds Useful?", + "authors": [ + "Mitchell A Gordon" + ], + "emails": [ + "~Mitchell_A_Gordon1" + ], + "rank": 1452 + }, + { + "url": "https://openreview.net/forum?id=_qoQkWNEhS", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6 + ], + "rating": "5.36", + "confidences": [ + 5, + 4, + 5 + ], + "abstract": "Graph neural networks (GNNs) rely heavily on the underlying graph topology and thus can be vulnerable to malicious attacks targeting at graph structures. We propose a novel GNN defense algorithm against structural attacks that maliciously modify graph topology. In particular, we discover a robust representation of the input graph based on the advanced theory of graph Ricci flow, which captures the intrinsic geometry of graphs and is robust to structural perturbation. We propose an algorithm to train GNNs using re-sampled graphs based on such geometric representation. We show that this method can be effective to protect against adversarial structural attacks. Our method achieves state-of-the-art performance on synthetic and real datasets against different types of graph poisoning attacks.", + "title": "Ricci-GNN: Defending Against Structural Attacks Through a Geometric Approach", + "authors": [ + "Ze Ye", + "Tengfei Ma", + "Chien-Chun Ni", + "Kin Sum Liu", + "Jie Gao", + "Chao Chen" + ], + "emails": [ + "~Tengfei_Ma1", + "verizonmedia.com", + "~Ze_Ye1", + "~Chao_Chen1", + "twitter.com", + "cs.rutgers.edu" + ], + "rank": 1453 + }, + { + "url": "https://openreview.net/forum?id=OAdGsaptOXy", + "decision": "Reject", + "ratings": [ + 7, + 4, + 6, + 5 + ], + "rating": "5.36", + "confidences": [ + 3, + 4, + 2, + 2 + ], + "abstract": "How much knowledge do pretrained language models hold? 
Recent research observed that pretrained transformers are adept at modeling semantics but it is unclear to what degree they grasp human knowledge, or how to ensure they do so. In this paper we incorporate knowledge-awareness in language model pretraining without changing the transformer architecture, inserting explicit knowledge layers, or adding external storage of semantic information. Rather, we simply signal the existence of entities to the input of the transformer in pretraining, with an entity-extended tokenizer; and at the output, with an additional entity prediction task. Our experiments show that solely by adding these entity signals in pretraining, significantly more knowledge is packed into the transformer parameters: we observe improved language modeling accuracy, factual correctness in LAMA knowledge probing tasks, and semantics in the hidden representations through edge probing. We also show that our knowledge-aware language model (\\kalm{}) can serve as a drop-in replacement for GPT-2 models, significantly improving downstream tasks like zero-shot question-answering with no task-related training. ", + "title": "Pretrain Knowledge-Aware Language Models", + "authors": [ + "Corbin L Rosset", + "Chenyan Xiong", + "Minh Phan", + "Xia Song", + "Paul N. Bennett", + "saurabh tiwary" + ], + "emails": [ + "~Xia_Song1", + "~Chenyan_Xiong1", + "microsoft.com", + "~saurabh_tiwary1", + "~Corbin_L_Rosset1", + "~Paul_N._Bennett1" + ], + "rank": 1454 + }, + { + "url": "https://openreview.net/forum?id=bi7nTZy4QmH", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 5 + ], + "rating": "5.33", + "confidences": [ + 5, + 3, + 2, + 5 + ], + "abstract": "Existing methods for training robust neural networks generally aim to make models uniformly robust on all input dimensions. However, different input dimensions are not uniformly important to the prediction. 
In this paper, we propose a novel framework to train certifiably robust models and learn non-uniform perturbation budgets on different input dimensions, in contrast to using the popular \u2113\u221e threat model. We incorporate a perturbation budget generator into the existing certified defense framework, and perform certified training with generated perturbation budgets. In comparison to the radius of \u2113\u221e ball in previous works, the robustness intensity is measured by robustness volume which is the multiplication of perturbation budgets on all input dimensions. We evaluate our method on MNIST and CIFAR-10 datasets and show that we can achieve lower clean and certified errors on relatively larger robustness volumes, compared to methods using uniform perturbation budgets. Further with two synthetic datasets constructed from MNIST and CIFAR-10, we also demonstrate that the perturbation budget generator can produce semantically-meaningful budgets, which implies that the generator can capture contextual information and the sensitivity of different features in input images.", + "title": "Learning Contextual Perturbation Budgets for Training Robust Neural Networks", + "authors": [ + "Jing Xu", + "Zhouxing Shi", + "Huan Zhang", + "Jinfeng Yi", + "Cho-Jui Hsieh", + "Liwei Wang" + ], + "emails": [ + "~Cho-Jui_Hsieh1", + "~Jinfeng_Yi1", + "~Zhouxing_Shi1", + "~Jing_Xu4", + "~Liwei_Wang1", + "~Huan_Zhang1" + ], + "rank": 1455 + }, + { + "url": "https://openreview.net/forum?id=Qpik5XBv_1-", + "decision": "Reject", + "ratings": [ + 5, + 4, + 10, + 2 + ], + "rating": "5.33", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "How to best integrate linguistic and perceptual processing in multimodal tasks is an important open problem. In this work we argue that the common technique of using language to direct visual attention over high-level visual features may not be optimal. 
Using language throughout the bottom-up visual pathway, going from pixels to high-level features, may be necessary. Our experiments on several English referring expression datasets show significant improvements when language is used to control the filters for bottom-up visual processing in addition to top-down attention.", + "title": "Language Controls More Than Top-Down Attention: Modulating Bottom-Up Visual Processing with Referring Expressions", + "authors": [ + "Ozan Arkan Can", + "Ilker Kesen", + "Deniz Yuret" + ], + "emails": [ + "~Ilker_Kesen1", + "~Ozan_Arkan_Can1", + "~Deniz_Yuret1" + ], + "rank": 1456 + }, + { + "url": "https://openreview.net/forum?id=ePh9bvqIgKL", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6 + ], + "rating": "5.33", + "confidences": [ + 5, + 5, + 5 + ], + "abstract": "Recent studies have shown that the choice of activation function can significantly affect the performance of deep learning networks. However, the benefits of novel activation functions have been inconsistent and task-dependent, and therefore the rectified linear unit (ReLU) is still the most commonly used. This paper proposes a technique for customizing activation functions automatically, resulting in reliable improvements in performance. Evolutionary search is used to discover the general form of the function, and gradient descent to optimize its parameters for different parts of the network and over the learning process. Experiments with three different neural network architectures on the CIFAR-100 image classification dataset show that this approach is effective. It discovers different activation functions for different architectures, and consistently improves accuracy over ReLU and other recently proposed activation functions by significant margins. 
The approach can therefore be used as an automated optimization step in applying deep learning to new tasks.", + "title": "Discovering Parametric Activation Functions", + "authors": [ + "Garrett Bingham", + "Risto Miikkulainen" + ], + "emails": [ + "~Risto_Miikkulainen1", + "~Garrett_Bingham1" + ], + "rank": 1457 + }, + { + "url": "https://openreview.net/forum?id=OqtLIabPTit", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 5, + 5 + ], + "rating": "5.33", + "confidences": [ + 5, + 5, + 5 + ], + "abstract": "Existing self-supervised learning (SSL) methods are mostly applied for training representation models from artificially balanced datasets (e.g., ImageNet). It is unclear how well they will perform in the practical scenarios where datasets are often imbalanced w.r.t. the classes. Motivated by this question, we conduct a series of studies on the performance of self-supervised contrastive learning and supervised learning methods over multiple datasets where training instance distributions vary from a balanced one to a long-tailed one. Our findings are quite intriguing. Different from supervised methods with large performance drop, the self-supervised contrastive learning methods perform stably well even when the datasets are heavily imbalanced. This motivates us to explore the balanced feature spaces learned by contrastive learning, where the feature representations present similar linear separability w.r.t. all the classes. Our further experiments reveal that a representation model generating a balanced feature space can generalize better than that yielding an imbalanced one across multiple settings. Inspired by these insights, we develop a novel representation learning method, called k-positive contrastive learning. It effectively combines strengths of the supervised method and the contrastive learning method to learn representations that are both discriminative and balanced. Extensive experiments demonstrate its superiority on multiple recognition tasks. 
Remarkably, it achieves new state-of-the-art on challenging long-tailed recognition benchmarks. Code and models will be released.", + "title": "Exploring Balanced Feature Spaces for Representation Learning", + "authors": [ + "Bingyi Kang", + "Yu Li", + "Sa Xie", + "Zehuan Yuan", + "Jiashi Feng" + ], + "emails": [ + "~Bingyi_Kang1", + "~Jiashi_Feng1", + "~Yu_Li7", + "~Sa_Xie1", + "~Zehuan_Yuan1" + ], + "rank": 1458 + }, + { + "url": "https://openreview.net/forum?id=yrDEUYauOMd", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 5, + 6 + ], + "rating": "5.33", + "confidences": [ + 4, + 4, + 3, + 4, + 3 + ], + "abstract": "Fairness of machine learning algorithms has been of increasing interest. In order to suppress or eliminate discrimination in prediction, various notions as well as approaches to impose fairness have been proposed. However, in different scenarios, whether or not the chosen notion of fairness can always be attained, even if with unlimited amount of data, is not well addressed. In this paper, focusing on the Equalized Odds notion of fairness, we consider the attainability of this criterion, and furthermore, if attainable, the optimality of the prediction performance under various settings. In particular, for classification with a deterministic prediction function of the input, we give the condition under which Equalized Odds can hold true; if randomized prediction is acceptable, we show that under mild assumptions, fair classifiers can always be derived. Moreover, we prove that compared to enforcing fairness by post-processing, one can always benefit from exploiting all available features during training and get better prediction performance while remaining fair. However, for regression tasks, Equalized Odds is not always attainable if certain conditions on the joint distribution of the features and the target variable are not met. 
This indicates the inherent difficulty in achieving fairness in certain cases and suggests a broader class of prediction methods might be needed for fairness.", + "title": "Attainability and Optimality: The Equalized-Odds Fairness Revisited", + "authors": [ + "Zeyu Tang", + "Kun Zhang" + ], + "emails": [ + "~Zeyu_Tang1", + "~Kun_Zhang1" + ], + "rank": 1459 + }, + { + "url": "https://openreview.net/forum?id=imnG4Ap9dAd", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5 + ], + "rating": "5.33", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "News-driven stock prediction investigates the correlation between news events and stock price movements.\nPrevious work has considered effective ways for representing news events and their sequences, but rarely exploited the representation of underlying equity states.\nWe address this issue by making use of a recurrent neural network to represent an equity state transition sequence, integrating news representation using contextualized embeddings as inputs to the state transition mechanism.\nThanks to the separation of news and equity representations, our model can accommodate additional input factors.\nWe design a novel random noise factor for modeling influencing factors beyond news events, and a future event factor to address the delay of news information (e.g., insider trading).\nResults show that the proposed model outperforms strong baselines in the literature.", + "title": "News-Driven Stock Prediction Using Noisy Equity State Representation", + "authors": [ + "Xiao Liu", + "Heyan Huang", + "Yue Zhang" + ], + "emails": [ + "wias.org.cn", + "bit.edu.cn", + "~Xiao_Liu14" + ], + "rank": 1460 + }, + { + "url": "https://openreview.net/forum?id=Siwm2BaNiG", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5 + ], + "rating": "5.33", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Many important problems in the real world don't have unique solutions. 
It is thus important for machine learning models to be capable of proposing different plausible solutions with meaningful probability measures.\nIn this work we propose a novel deep learning based framework, named {\\it modal uncertainty estimation} (MUE), to learn the one-to-many mappings between the inputs and outputs, together with faithful uncertainty estimation.\nMotivated by the multi-modal posterior collapse problem in current conditional generative models, MUE uses a set of discrete latent variables, each representing a latent mode hypothesis that explains one type of input-output relationship, to generate the one-to-many mappings. Benefit from the discrete nature of the latent representations, MUE can estimate any input the conditional probability distribution of the outputs effectively. Moreover, MUE is efficient during training since the discrete latent space and its uncertainty estimation are jointly learned.\nWe also develop the theoretical background of MUE and extensively validate it on both synthetic and realistic tasks. MUE demonstrates (1) significantly more accurate uncertainty estimation than the current state-of-the-art, and (2) its informativeness for practical use.\n\n", + "title": "Modal Uncertainty Estimation via Discrete Latent Representations", + "authors": [ + "Di Qiu", + "Zhanghan Ke", + "Peng Su", + "Lok Ming Lui" + ], + "emails": [ + "~Lok_Ming_Lui2", + "~Zhanghan_Ke1", + "~Peng_Su1", + "~Di_Qiu1" + ], + "rank": 1461 + }, + { + "url": "https://openreview.net/forum?id=2hT6Fbbwh6", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6 + ], + "rating": "5.33", + "confidences": [ + 5, + 3, + 4 + ], + "abstract": "For many domains, from video stream analytics to human activity recognition, only weakly-labeled datasets are available.\nWorse yet, the given labels are often assigned sequentially, resulting in sequential bias. 
Current Positive Unlabeled (PU) classifiers, a state-of-the-art family of robust semi-supervised methods, are ineffective under sequential bias. In this work, we propose DeepSPU, the first method to address this sequential bias problem. DeepSPU tackles the two interdependent subproblems of learning both the latent labeling process and the true class likelihoods within one architecture. We achieve this by developing a novel iterative learning strategy aided by theoretically-justified cost terms to avoid collapsing into a naive classifier. Our experimental studies demonstrate that DeepSPU outperforms state-of-the-art methods by over 10% on diverse real-world datasets.", + "title": "Deep Positive Unlabeled Learning with a Sequential Bias", + "authors": [ + "Walter Gerych", + "Thomas Hartvigsen", + "Luke Buquicchio", + "Kavin Chandrasekaran", + "Hamid Mansoor", + "Abdulaziz alajaji" + ], + "emails": [ + "~Thomas_Hartvigsen1", + "~Walter_Gerych2", + "wpi.edu" + ], + "rank": 1462 + }, + { + "url": "https://openreview.net/forum?id=awMgJJ9H-0q", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5 + ], + "rating": "5.33", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "We propose an Euler particle transport (EPT) approach for generative learning. The proposed approach is motivated by the problem of finding the optimal transport map from a reference distribution to a target distribution characterized by the Monge-Ampere equation. Interpreting the infinitesimal linearization of the Monge-Ampere equation from the perspective of gradient flows in measure spaces leads to a stochastic McKean-Vlasov equation. We use the forward Euler method to solve this equation. The resulting forward Euler map pushes forward a reference distribution to the target. This map is the composition of a sequence of simple residual maps, which are computationally stable and easy to train. 
The key task in training is the estimation of the density ratios or differences that determine the residual maps. We estimate the density ratios (differences) based on the Bregman divergence with a gradient penalty using deep density-ratio (difference) fitting. We show that the proposed density-ratio (difference) estimators do not suffer from the ``curse of dimensionality\" if data is supported on a lower-dimensional manifold. Numerical experiments with multi-mode synthetic datasets and comparisons with the existing methods on real benchmark datasets support our theoretical results and demonstrate the effectiveness of the proposed method.", + "title": "Generative Learning With Euler Particle Transport", + "authors": [ + "Yuan Gao", + "Jian Huang", + "Yuling Jiao", + "Jin Liu" + ], + "emails": [ + "~Jin_Liu5", + "gmail.com", + "whu.edu.cn", + "~Jian_Huang5" + ], + "rank": 1463 + }, + { + "url": "https://openreview.net/forum?id=4mkxyuPcFt", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5, + 6 + ], + "rating": "5.33", + "confidences": [ + 3, + 3, + 4, + 5 + ], + "abstract": "Using generative models (GAN or VAE) to craft adversarial examples, i.e. generative adversarial examples, has received increasing attention in recent years. Previous studies showed that the generative adversarial examples work differently compared to that of the regular adversarial examples in many aspects, such as attack rates, perceptibility, and generalization. But the reasons causing the differences between regular and generative adversarial examples are unclear. In this work, we study the theoretical properties of the attacking mechanisms of the two kinds of adversarial examples in the Gaussian mixture data model case. We prove that adversarial robustness can be disentangled in directions of the data manifold. Specifically, we find that: 1. 
Regular adversarial examples attack in directions of small variance of the data manifold, while generative adversarial examples attack in directions of large variance. 2. Standard adversarial training increases model robustness by extending the data manifold boundary in directions of small variance, while on the contrary, adversarial training with generative adversarial examples increases model robustness by extending the data manifold boundary directions of large variance. In experiments, we demonstrate that these phenomena also exist on real datasets. Finally, we study the robustness trade-off between generative and regular adversarial examples. We show that the conflict between regular and generative adversarial examples is much smaller than the conflict between regular adversarial examples of different norms.", + "title": "Disentangling Adversarial Robustness in Directions of the Data Manifold", + "authors": [ + "Jiancong Xiao", + "Liusha Yang", + "Zhi-Quan Luo" + ], + "emails": [ + "~Zhi-Quan_Luo1", + "~Liusha_Yang1", + "~Jiancong_Xiao1" + ], + "rank": 1464 + }, + { + "url": "https://openreview.net/forum?id=l3YcqzaPlx0", + "decision": "Reject", + "ratings": [ + 7, + 4, + 5, + 5 + ], + "rating": "5.33", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Modern Graph Neural Networks (GNNs) follow a recursive neighbor-wise message passing scheme and have achieved great success in many fields. However, this recursive design brings expensive computation and huge memory usage, making it difficult to deploy on large-scale graphs. In this work, we propose Neighbor2Seq, which transforms the hierarchical neighborhood of each node into an ordered sequence and enables the subsequent utilization of general deep learning operations, such as convolution and attention. Neighbor2Seq grants our proposed models, i.e., Neighbor2Seq-Conv and Neighbor2Seq-Attn, the ability to learn on arbitrarily large graphs as long as the Neighbor2Seq step can be precomputed. 
Another potential advantage obtained by the way is that Neighbor2Seq can alleviate the over-squashing issue existing in modern GNNs. We conduct thorough experiments on a massive graph with more than 111 million nodes and 1.6 billion edges, as well as several medium-scale graphs, to evaluate our proposed method. Experimental results demonstrate that our proposed method is scalable to the massive graph and achieves superior performance across datasets.", + "title": "Neighbor2Seq: Deep Learning on Massive Graphs by Transforming Neighbors to Sequences", + "authors": [ + "Meng Liu", + "Shuiwang Ji" + ], + "emails": [ + "~Meng_Liu3", + "~Shuiwang_Ji1" + ], + "rank": 1465 + }, + { + "url": "https://openreview.net/forum?id=49mMdsxkPlD", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 6 + ], + "rating": "5.33", + "confidences": [ + 3, + 3, + 2, + 4 + ], + "abstract": "Policy networks are a central feature of deep reinforcement learning (RL) algorithms for continuous control, enabling the estimation and sampling of high-value actions. From the variational inference perspective on RL, policy networks, when employed with entropy or KL regularization, are a form of amortized optimization, optimizing network parameters rather than the policy distributions directly. However, this direct amortized mapping can empirically yield suboptimal policy estimates and limited exploration. Given this perspective, we consider the more flexible class of iterative amortized optimizers. 
We demonstrate that the resulting technique, iterative amortized policy optimization, yields performance improvements over direct amortization methods on benchmark continuous control tasks.", + "title": "Iterative Amortized Policy Optimization", + "authors": [ + "Joseph Marino", + "Alexandre Pich\u00e9", + "Alessandro Davide Ialongo", + "Yisong Yue" + ], + "emails": [ + "~Alexandre_Pich\u00e91", + "~Alessandro_Davide_Ialongo1", + "~Yisong_Yue1", + "~Joseph_Marino1" + ], + "rank": 1466 + }, + { + "url": "https://openreview.net/forum?id=tFPAIXpb13", + "decision": "Reject", + "ratings": [ + 5, + 7, + 4, + 5 + ], + "rating": "5.33", + "confidences": [ + 5, + 5, + 4, + 4 + ], + "abstract": "Convolution learning on graphs draws increasing attention recently due to its potential applications to a large amount of irregular data. Most graph convolution methods leverage the plain summation/average aggregation to avoid the discrepancy of responses from isomorphic graphs. However, such an extreme collapsing way would result in a structural loss and signal entanglement of nodes, which further cause the degradation of the learning ability. In this paper, we propose a simple yet effective graph deformer network (GDN) to fulfill anisotropic convolution filtering on graphs, analogous to the standard convolution operation on images. Local neighborhood subgraphs (acting like receptive fields) with different structures are deformed into a unified virtual space, coordinated by several anchor nodes. In space deformation, we transfer components of nodes therein into anchors by learning their correlation, and build a pseudo multi-granularity plane calibrated with anchors. Anisotropic convolutional kernels can be further performed over the anchor-coordinated space to well encode local variations of receptive fields. By parameterizing anchors and stacking coarsening layers, we build a graph deformer network in an end-to-end fashion. 
Theoretical analysis indicates its connection to previous work and shows the promising property of isomorphism testing. Extensive experiments on widely-used datasets validate the effectiveness of the proposed GDN in node and graph classifications.", + "title": "Graph Deformer Network", + "authors": [ + "Wenting Zhao", + "Yuan Fang", + "Zhen Cui", + "Tong Zhang", + "Jian Yang", + "Wei Liu" + ], + "emails": [ + "~Jian_Yang1", + "njust.edu.cn", + "~Wei_Liu3", + "~Tong_Zhang8", + "~Zhen_Cui4", + "~Wenting_Zhao2" + ], + "rank": 1467 + }, + { + "url": "https://openreview.net/forum?id=QzKDLiosEd", + "decision": "Reject", + "ratings": [ + 5, + 7, + 4 + ], + "rating": "5.33", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "We examine the magnetic flux emanating from a graphics processing unit\u2019s (GPU\u2019s) power cable, as acquired by a cheap $3 induction sensor, and find that this signal betrays the detailed topology and hyperparameters of a black-box neural network model. The attack acquires the magnetic signal for one query with unknown input values, but known input dimension and batch size. The reconstruction is possible due to the modular layer sequence in which deep neural networks are evaluated. We find that each layer component\u2019s evaluation produces an identifiable magnetic signal signature, from which layer topology, width, function type, and sequence order can be inferred using a suitably trained classifier and an optimization based on integer programming. We study the extent to which network specifications can be recovered, and consider metrics for comparing network similarity. We demonstrate the potential accuracy of this side channel attack in recovering the details for a broad range of network architectures including also random designs. We consider applications that may exploit this novel side channel exposure, such as adversarial transfer attacks. 
In response, we discuss countermeasures to protect against our method and other similar snooping techniques.", + "title": "Can one hear the shape of a neural network?: Snooping the GPU via Magnetic Side Channel", + "authors": [ + "Henrique Teles Maia", + "Chang Xiao", + "Dingzeyu Li", + "Eitan Grinspun", + "Changxi Zheng" + ], + "emails": [ + "~Chang_Xiao1", + "~Dingzeyu_Li2", + "~Changxi_Zheng1", + "~Henrique_Teles_Maia1", + "~Eitan_Grinspun3" + ], + "rank": 1468 + }, + { + "url": "https://openreview.net/forum?id=cjk5mri_aOm", + "decision": "Reject", + "ratings": [ + 6, + 6, + 4, + 5 + ], + "rating": "5.33", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": " We introduce environment predictive coding, a self-supervised approach to learn environment-level representations for embodied agents. In contrast to prior work on self-supervised learning for images, we aim to jointly encode a series of images gathered by an agent as it moves about in 3D environments. We learn these representations via a zone prediction task, where we intelligently mask out portions of an agent's trajectory and predict them from the unmasked portions, conditioned on the agent's camera poses. By learning such representations on a collection of videos, we demonstrate successful transfer to multiple downstream navigation-oriented tasks. 
Our experiments on the photorealistic 3D environments of Gibson and Matterport3D show that our method outperforms the state-of-the-art on challenging tasks with only a limited budget of experience.", + "title": "Environment Predictive Coding for Embodied Agents", + "authors": [ + "Santhosh Kumar Ramakrishnan", + "Tushar Nagarajan", + "Ziad Al-Halah", + "Kristen Grauman" + ], + "emails": [ + "~Tushar_Nagarajan1", + "~Santhosh_Kumar_Ramakrishnan1", + "~Ziad_Al-Halah2", + "~Kristen_Grauman1" + ], + "rank": 1469 + }, + { + "url": "https://openreview.net/forum?id=cvNYovr16SB", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5 + ], + "rating": "5.33", + "confidences": [ + 3, + 4, + 5 + ], + "abstract": "We introduce a new unsupervised pre-training method for reinforcement learning called APT, which stands for ActivePre-Training. APT learns a representation and a policy initialization by actively searching for novel states in reward-free environments. We use the contrastive learning framework for learning the representation from collected transitions. The key novel idea is to collect data during pre-training by maximizing a particle based entropy computed in the learned latent representation space. By doing particle based entropy maximization, we alleviate the need for challenging density modeling and are thus able to scale our approach to image observations. APT successfully learns meaningful representations as well as policy initializations without using any reward. We empirically evaluate APT on the Atari game suite and DMControl suite by exposing task-specific reward to agent after a long unsupervised pre-training phase. On Atari games, APT achieves human-level performance on 12 games and obtains highly competitive performance compared to canonical fully supervised RL algorithms. 
On DMControl suite, APT beats all baselines in terms of asymptotic performance and data efficiency and dramatically improves performance on tasks that are extremely difficult for training from scratch. Importantly, the pre-trained models can be fine-tuned to solve different tasks as long as the environment does not change. Finally, we also pre-train multi-environment encoders on data from multiple environments and show generalization to a broad set of RL tasks.", + "title": "Unsupervised Active Pre-Training for Reinforcement Learning", + "authors": [ + "Hao Liu", + "Pieter Abbeel" + ], + "emails": [ + "~Pieter_Abbeel2", + "~Hao_Liu1" + ], + "rank": 1470 + }, + { + "url": "https://openreview.net/forum?id=XG1Drw7VbLJ", + "decision": "Reject", + "ratings": [ + 4, + 6, + 6, + 5 + ], + "rating": "5.33", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "In recent years there has been substantial progress in few-shot learning, where a model is trained on a small labeled dataset related to a specific task, and in continual learning, where a model has to retain knowledge acquired on a sequence of datasets. Both of these fields are different abstractions of the same real world scenario, where a learner has to adapt to limited information from different changing sources and be able to generalize in and from each of them. Combining these two paradigms, where a model is trained on several sequential few-shot tasks, and then tested on a validation set stemming from all those tasks, helps by explicitly defining the competing requirements for both efficient integration and continuity. In this paper we propose such a setting, naming it Continual Few-Shot Learning (CFSL). We first define a theoretical framework for CFSL, then we propose a range of flexible benchmarks to unify the evaluation criteria. 
As part of the benchmark, we introduce a compact variant of ImageNet, called SlimageNet64, which retains all original 1000 classes but only contains 200 instances of each one (a total of 200K data-points) downscaled to 64 by 64 pixels. We provide baselines for the proposed benchmarks using a number of popular few-shot and continual learning methods, exposing previously unknown strengths and weaknesses of those algorithms. The dataloader and dataset will be released with an open-source license.", + "title": "Defining Benchmarks for Continual Few-Shot Learning", + "authors": [ + "Antreas Antoniou", + "Massimiliano Patacchiola", + "Mateusz Ochal", + "Amos Storkey" + ], + "emails": [ + "~Amos_Storkey1", + "~Massimiliano_Patacchiola1", + "~Mateusz_Ochal1", + "~Antreas_Antoniou2" + ], + "rank": 1471 + }, + { + "url": "https://openreview.net/forum?id=54-QTuqSLyn", + "decision": "Reject", + "ratings": [ + 5, + 4, + 7, + 6 + ], + "rating": "5.33", + "confidences": [ + 3, + 5, + 3, + 4 + ], + "abstract": "Generative Adversarial Networks (GANs) are a class of generative models used for various applications, but they have been known to suffer from the mode collapse problem, in which some modes of the target distribution are ignored by the generator. Investigative study using a new data generation procedure indicates that the mode collapse of the generator is driven by the discriminator\u2019s inability to maintain classification accuracy on previously seen samples, a phenomenon called Catastrophic Forgetting in continual learning. Motivated by this observation, we introduce a novel training procedure that dynamically spawns additional dis-criminators to remember previous modes of generation. 
On several datasets, we show that our training scheme can be plugged-in to existing GAN frameworks to mitigate mode collapse and improve standard metrics for GAN evaluation.", + "title": "Mitigating Mode Collapse by Sidestepping Catastrophic Forgetting", + "authors": [ + "Karttikeya Mangalam", + "Rohin Garg", + "Jathushan Rajasegaran", + "Taesung Park" + ], + "emails": [ + "~Rohin_Garg1", + "~Karttikeya_Mangalam1", + "~Taesung_Park2", + "~Jathushan_Rajasegaran1" + ], + "rank": 1472 + }, + { + "url": "https://openreview.net/forum?id=_L6b4Qzn5bp", + "decision": "Reject", + "ratings": [ + 6, + 4, + 6 + ], + "rating": "5.33", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Coronavirus disease 2019 (COVID-19), the pandemic that is spreading fast globally, has caused over 34 million confirmed cases. Apart from the reverse transcription polymerase chain reaction (RT-PCR), the chest computed tomography (CT) is viewed as a standard and effective tool for disease diagnosis and progression monitoring. We propose a diagnosis and prognosis model based on graph convolutional networks (GCNs). The chest CT scan of a patient, typically involving hundreds of sectional images in sequential order, is formulated as a densely connected weighted graph. A novel distance aware pooling is proposed to abstract the node information hierarchically, which is robust and efficient for such densely connected graphs. Our method, combining GCNs and distance aware pooling, can integrate the information from all slices in the chest CT scans for optimal decision making, which leads to the state-of-the-art accuracy in the COVID-19 diagnosis and prognosis. With less than 1% number of total parameters in the baseline 3D ResNet model, our method achieves 94.8% accuracy for diagnosis. It has a 2.4% improvement compared with the baseline model on the same dataset. In addition, we can localize the most informative slices with disease lesions for COVID-19 within a large sequence of chest CT images. 
The proposed model can produce visual explanations for the diagnosis and prognosis, making the decision more transparent and explainable, while RT-PCR only leads to the test result with no prognosis information. The prognosis analysis can help hospitals or clinical centers designate medical resources more efficiently and better support clinicians to determine the proper clinical treatment. ", + "title": "Beyond COVID-19 Diagnosis: Prognosis with Hierarchical Graph Representation Learning", + "authors": [ + "CHEN LIU", + "Jinze Cui", + "Dailin Gan", + "Guosheng Yin" + ], + "emails": [ + "~Guosheng_Yin1", + "~Jinze_Cui2", + "~CHEN_LIU7", + "~Dailin_Gan1" + ], + "rank": 1473 + }, + { + "url": "https://openreview.net/forum?id=XoF2fGAvXO6", + "decision": "Reject", + "ratings": [ + 5, + 7, + 4, + 6 + ], + "rating": "5.33", + "confidences": [ + 3, + 3, + 4, + 2 + ], + "abstract": "Neural Module Networks (NMNs) have been quite successful in incorporating explicit reasoning as learnable modules in various question answering tasks, including the most generic form of numerical reasoning over text in Machine Reading Comprehension (MRC). However, to achieve this, contemporary NMNs need strong supervision in executing the query as a specialized program over the reasoning modules and fail to generalize to more open-ended settings where such supervision is not readily available. In this work, we propose Weakly Supervised Neuro-Symbolic Module Network (WNSMN) trained with answers as the sole supervision for numerical reasoning based MRC. It learns to execute a noisy heuristic program obtained from dependency parsing of the query as discrete actions over both neural and symbolic reasoning modules and trains it end-to-end in a reinforcement learning framework with discrete reward from answer matching. 
On the numerical-answer subset of the DROP dataset, WNSMN outperforms NMN by 32% and the reasoning-free language model GenBERT by 8% in exact match accuracy when trained under comparable weak supervised settings. This showcases the effectiveness and generalizability of modular networks that can handle explicit discrete reasoning over noisy programs in an end-to-end manner. ", + "title": "Weakly Supervised Neuro-Symbolic Module Networks for Numerical Reasoning", + "authors": [ + "Amrita Saha", + "Shafiq Joty", + "Steven Hoi" + ], + "emails": [ + "~Shafiq_Joty1", + "~Amrita_Saha1", + "~Steven_Hoi2" + ], + "rank": 1474 + }, + { + "url": "https://openreview.net/forum?id=VNJUTmR-CaZ", + "decision": "Reject", + "ratings": [ + 7, + 5, + 4 + ], + "rating": "5.33", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "This paper presents a new graph neural network architecture over which reinforcement learning can be performed to yield online policies for an important class of multi-robot task allocation (MRTA) problems, one that involves tasks with deadlines, and robots with ferry range and payload constraints and multi-tour capability. While drawing motivation from recent graph learning methods that learn to solve combinatorial optimization problems of the mTSP/VRP type, this paper seeks to provide better convergence and generalizability specifically for MRTA problems. The proposed neural architecture, called Covariant Attention-based Model or CAM, includes three main components: 1) an encoder: a covariant compositional node-based embedding is used to represent each task as a learnable feature vector in manner that preserves the local structure of the task graph while being invariant to the ordering of graph nodes; 2) context: a vector representation of the mission time and state of the concerned robot and its peers; and 2) a decoder: builds upon the attention mechanism to facilitate a sequential output. 
In order to train the CAM model, a policy-gradient method based on REINFORCE is used. While the new architecture can solve the broad class of MRTA problems stated above, to demonstrate real-world applicability we use a multi-unmanned aerial vehicle or multi-UAV-based flood response problem for evaluation purposes. For comparison, the well-known attention-based approach (designed to solve mTSP/VRP problems) is extended and applied to the MRTA problem, as a baseline. The results show that the proposed CAM method is not only superior to the baseline AM method in terms of the cost function (over training and unseen test scenarios), but also provide significantly faster convergence and yields learnt policies that can be executed within 2.4ms/robot, thereby allowing real-time application.", + "title": "Learning to Solve Multi-Robot Task Allocation with a Covariant-Attention based Neural Architecture", + "authors": [ + "Steve Paul", + "Payam Ghassemi", + "Souma Chowdhury" + ], + "emails": [ + "~Steve_Paul1", + "~Souma_Chowdhury1", + "~Payam_Ghassemi2" + ], + "rank": 1475 + }, + { + "url": "https://openreview.net/forum?id=H8UHdhWG6A3", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 7, + 4, + 6 + ], + "rating": "5.33", + "confidences": [ + 3, + 4, + 3, + 2 + ], + "abstract": "Byzantine-resilient Stochastic Gradient Descent (SGD) aims at shielding model training from Byzantine faults, be they ill-labeled training datapoints, exploited software/hardware vulnerabilities, or malicious worker nodes in a distributed setting.\nTwo recent attacks have been challenging state-of-the-art defenses though, often successfully precluding the model from even fitting the training set.\nThe main identified weakness in current defenses is their requirement of a sufficiently low variance-norm ratio for the stochastic gradients.\nWe propose a practical method which, despite increasing the variance, reduces the variance-norm ratio, mitigating the identified weakness.\nWe assess the 
effectiveness of our method over 736 different training configurations, comprising the 2 state-of-the-art attacks and 6 defenses.\nFor confidence and reproducibility purposes, each configuration is run 5 times with specified seeds (1 to 5), totalling 3680 runs.\nIn our experiments, when the attack is effective enough to decrease the highest observed top-1 cross-accuracy by at least 20% compared to the unattacked run, our technique systematically increases back the highest observed accuracy, and is able to recover at least 20% in more than 60% of the cases.", + "title": "Distributed Momentum for Byzantine-resilient Stochastic Gradient Descent", + "authors": [ + "El Mahdi El Mhamdi", + "Rachid Guerraoui", + "S\u00e9bastien Rouault" + ], + "emails": [ + "polytechnique.edu", + "~Rachid_Guerraoui1", + "~S\u00e9bastien_Rouault1" + ], + "rank": 1476 + }, + { + "url": "https://openreview.net/forum?id=uUX49ez8P06", + "decision": "Reject", + "ratings": [ + 6, + 4, + 6, + 6 + ], + "rating": "5.33", + "confidences": [ + 3, + 5, + 4, + 3 + ], + "abstract": "Continual learning with neural networks is an important learning framework in AI that aims to learn a sequence of tasks well. However, it is often confronted with three challenges: (1) overcome the catastrophic forgetting problem, (2) adapt the current network to new tasks, and meanwhile (3) control its model complexity. To reach these goals, we propose a novel approach named as Continual Learning with Efficient Architecture Search, or CLEAS in short. CLEAS works closely with neural architecture search (NAS) which leverages reinforcement learning techniques to search for the best neural architecture that fits a new task. In particular, we design a neuron-level NAS controller that decides which old neurons from previous tasks should be reused (knowledge transfer), and which new neurons should be added (to learn new knowledge). 
Such a fine-grained controller allows finding a very concise architecture that can fit each new task well. Meanwhile, since we do not alter the weights of the reused neurons, we perfectly memorize the knowledge learned from previous tasks. We evaluate CLEAS on numerous sequential classification tasks, and the results demonstrate that CLEAS outperforms other state-of-the-art alternative methods, achieving higher classification accuracy while using simpler neural architectures.\n", + "title": "Efficient Architecture Search for Continual Learning", + "authors": [ + "Qiang Gao", + "Zhipeng Luo", + "Diego Klabjan", + "Fengli Zhang" + ], + "emails": [ + "northwestern.edu", + "uestc.edu.cn", + "~Qiang_Gao1", + "~Diego_Klabjan1" + ], + "rank": 1477 + }, + { + "url": "https://openreview.net/forum?id=HMqNjkBEqP4", + "decision": "Reject", + "ratings": [ + 5, + 4, + 7 + ], + "rating": "5.33", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Estimating the 3D shape of real-world objects is a key perceptual challenge. It requires going from partial observations, which are often too sparse and incomprehensible for the human eye, to detailed shape representations that vary significantly across categories and instances. We propose to cast shape completion as a Bayesian meta-learning problem to facilitate the transfer of knowledge learned from observing one object into estimating the shape of another object. To combine the Bayesian framework with an approach that uses implicit 3D object representation, we introduce an encoder that describes the posterior distribution of a latent representation conditioned on sparse point clouds. With its ability to isolate object-specific properties from object-agnostic properties, our meta-learning algorithm enables accurate shape completion of newly-encountered objects from sparse observations. We demonstrate the efficacy of our proposed method with experimental results on the standard ShapeNet and ICL-NUIM benchmarks. 
", + "title": "Bayesian Meta-Learning for Few-Shot 3D Shape Completion ", + "authors": [ + "Masanori Koyama", + "Toshiki Nakanishi", + "Shin-ichi Maeda", + "Vitor Campagnolo Guizilini", + "Adrien Gaidon" + ], + "emails": [ + "~Adrien_Gaidon1", + "~Toshiki_Nakanishi1", + "~Vitor_Campagnolo_Guizilini2", + "~Masanori_Koyama1", + "~Shin-ichi_Maeda2" + ], + "rank": 1478 + }, + { + "url": "https://openreview.net/forum?id=bkincnjT8zx", + "decision": "Reject", + "ratings": [ + 4, + 8, + 5, + 5 + ], + "rating": "5.33", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "We introduce Neural Dynamical Systems (NDS), a method of learning dynamical models in various gray-box settings which incorporates prior knowledge in the form of systems of ordinary differential equations. NDS uses neural networks to estimate free parameters of the system, predicts residual terms, and numerically integrates over time to predict future states. A key insight is that many real dynamical systems of interest are hard to model because the dynamics may vary across rollouts. We mitigate this problem by taking a trajectory of prior states as the input to NDS and train it to dynamically estimate system parameters using the preceding trajectory. We find that NDS learns dynamics with higher accuracy and fewer samples than a variety of deep learning methods that do not incorporate the prior knowledge and methods from the system identification literature which do. We demonstrate these advantages first on synthetic dynamical systems and then on real data captured from deuterium shots from a nuclear fusion reactor. 
Finally, we demonstrate that these benefits can be utilized for control in small-scale experiments.", + "title": "Neural Dynamical Systems: Balancing Structure and Flexibility in Physical Prediction", + "authors": [ + "Viraj Mehta", + "Ian Char", + "Willie Neiswanger", + "Youngseog Chung", + "Andrew Oakleigh Nelson", + "Mark D Boyer", + "Egemen Kolemen", + "Jeff Schneider" + ], + "emails": [ + "~Willie_Neiswanger1", + "pppl.gov", + "cs.cmu.edu", + "~Viraj_Mehta1", + "~Jeff_Schneider1" + ], + "rank": 1479 + }, + { + "url": "https://openreview.net/forum?id=f_GA2IU9-K-", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5, + 6 + ], + "rating": "5.33", + "confidences": [ + 5, + 4, + 4, + 5 + ], + "abstract": "Although distributional reinforcement learning (DRL) has been widely examined in the past few years, there are two open questions people are still trying to address. One is how to ensure the validity of the learned quantile function, the other is how to efficiently utilize the distribution information. This paper attempts to provide some new perspectives to encourage the future in-depth studies in these two fields. We first propose a non-decreasing quantile function network (NDQFN) to guarantee the monotonicity of the obtained quantile estimates and then design a general exploration framework called distributional prediction error (DPE) for DRL which utilizes the entire distribution of the quantile function. 
In this paper, we not only discuss the theoretical necessity of our method but also show the performance gain it achieves in practice by comparing with some competitors on Atari 2600 Games especially in some hard-explored games.", + "title": "Non-decreasing Quantile Function Network with Efficient Exploration for Distributional Reinforcement Learning", + "authors": [ + "Fan Zhou", + "Zhoufan Zhu", + "Qi Kuang", + "Liwen Zhang" + ], + "emails": [ + "~Fan_Zhou7", + "~Zhoufan_Zhu1", + "~Qi_Kuang1", + "~Liwen_Zhang3" + ], + "rank": 1480 + }, + { + "url": "https://openreview.net/forum?id=Ns8v4jHGyAV", + "decision": "Reject", + "ratings": [ + 4, + 4, + 8 + ], + "rating": "5.33", + "confidences": [ + 3, + 3, + 3 + ], + "abstract": "Convolutional neural networks have become the main tools for processing two-dimensional data. They work well for images, yet convolutions have a limited receptive field that prevents its applications to more complex 2D tasks. We propose a new neural model, called Matrix Shuffle-Exchange network, that can efficiently exploit long-range dependencies in 2D data and has comparable speed to a convolutional neural network. It is derived from Neural Shuffle-Exchange network and has O(log\u2061n) layers and O(n2log\u2061n) total time and space complexity for processing a n\u00d7n data matrix. We show that the Matrix Shuffle-Exchange network is well-suited for algorithmic and logical reasoning tasks on matrices and dense graphs, exceeding convolutional and graph neural network baselines. 
Its distinct advantage is the capability of retaining full long-range dependency modelling when generalizing to larger instances -- much larger than could be processed with models equipped with a dense attention mechanism.", + "title": "Matrix Shuffle-Exchange Networks for Hard 2D Tasks", + "authors": [ + "Em\u012bls Ozoli\u0146\u0161", + "Karlis Freivalds", + "Agris \u0160ostaks" + ], + "emails": [ + "lumii.lv", + "~Karlis_Freivalds1", + "~Em\u012bls_Ozoli\u0146\u01611" + ], + "rank": 1481 + }, + { + "url": "https://openreview.net/forum?id=EoVmlONgI9e", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5, + 6 + ], + "rating": "5.33", + "confidences": [ + 5, + 4, + 4, + 5 + ], + "abstract": "Individuality is essential in human society, which induces the division of labor and thus improves the efficiency and productivity. Similarly, it should also be a key to multi-agent cooperation. Inspired by that individuality is of being an individual separate from others, we propose a simple yet efficient method for the emergence of individuality (EOI) in multi-agent reinforcement learning (MARL). EOI learns a probabilistic classifier that predicts a probability distribution over agents given their observation and gives each agent an intrinsic reward of being correctly predicted by the classifier. The intrinsic reward encourages the agents to visit their own familiar observations, and learning the classifier by such observations makes the intrinsic reward signals stronger and in turn makes the agents more identifiable. To further enhance the intrinsic reward and promote the emergence of individuality, two regularizers are proposed to increase the discriminability of the classifier. We implement EOI on top of popular MARL algorithms. 
Empirically, we show that EOI outperforms existing methods in a variety of multi-agent cooperative scenarios.", + "title": "The Emergence of Individuality in Multi-Agent Reinforcement Learning", + "authors": [ + "Jiechuan Jiang", + "Zongqing Lu" + ], + "emails": [ + "~Jiechuan_Jiang1", + "~Zongqing_Lu2" + ], + "rank": 1482 + }, + { + "url": "https://openreview.net/forum?id=a5KvtsZ14ev", + "decision": "Reject", + "ratings": [ + 5, + 7, + 5, + 5, + 5 + ], + "rating": "5.33", + "confidences": [ + 3, + 3, + 4, + 4, + 4 + ], + "abstract": "Graph neural networks (GNNs) work well when the graph structure is provided. However, this structure may not always be available in real-world applications. One solution to this problem is to infer the latent structure and then apply a GNN to the inferred graph. Unfortunately, the space of possible graph structures grows super-exponentially with the number of nodes and so the available node labels may be insufficient for learning both the structure and the GNN parameters. In this work, we propose the Simultaneous Learning of Adjacency and GNN Parameters with Self-supervision, or SLAPS, a method that provides more supervision for inferring a graph structure. This approach consists of training a denoising autoencoder GNN in parallel with the task-specific GNN. The autoencoder is trained to reconstruct the initial node features given noisy node features as well as a structure provided by a learnable graph generator. We explore the design space of SLAPS by comparing different graph generation and symmetrization approaches. 
A comprehensive experimental study demonstrates that SLAPS scales to large graphs with hundreds of thousands of nodes and outperforms several models that have been proposed to learn a task-specific graph structure on established benchmarks.", + "title": "SLAPS: Self-Supervision Improves Structure Learning for Graph Neural Networks", + "authors": [ + "Bahare Fatemi", + "Seyed Mehran Kazemi", + "Layla El Asri" + ], + "emails": [ + "~Seyed_Mehran_Kazemi1", + "~Bahare_Fatemi1", + "~Layla_El_Asri2" + ], + "rank": 1483 + }, + { + "url": "https://openreview.net/forum?id=SncSswKUse", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 5 + ], + "rating": "5.33", + "confidences": [ + 3, + 2, + 4, + 3 + ], + "abstract": "A central goal in neurobiology is to relate the expression of genes to the structural and functional properties of neuronal types, collectively called their phenotypes. Single-cell RNA sequencing can measure the expression of thousands of genes in thousands of neurons. How to interpret the data in the context of neuronal phenotypes? We propose a supervised learning approach that factorizes the gene expression data into components corresponding to individual phenotypic characteristics and their interactions. This new method, which we call factorized linear discriminant analysis (FLDA), seeks a linear transformation of gene expressions that varies highly with only one phenotypic factor and minimally with the others. We further leverage our approach with a sparsity-based regularization algorithm, which selects a few genes important to a specific phenotypic feature or feature combination. We applied this approach to a single-cell RNA-Seq dataset of Drosophila T4/T5 neurons, focusing on their dendritic and axonal phenotypes. 
The analysis confirms results obtained by conventional methods but also points to new genes related to the phenotypes and an intriguing hierarchy in the genetic organization of these cells.", + "title": "Factorized linear discriminant analysis for phenotype-guided representation learning of neuronal gene expression data", + "authors": [ + "Mu Qiao", + "Markus Meister" + ], + "emails": [ + "~Mu_Qiao3", + "~Markus_Meister1" + ], + "rank": 1484 + }, + { + "url": "https://openreview.net/forum?id=bK-rJMKrOsm", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 6 + ], + "rating": "5.33", + "confidences": [ + 3, + 4, + 3, + 5 + ], + "abstract": "Attention layers are widely used in natural language processing (NLP) and are beginning to influence computer vision architectures. However, they suffer from over-parameterization. For instance, it was shown that the majority of attention heads could be pruned without impacting accuracy. This work aims to enhance current understanding on how multiple heads interact. Motivated by the observation that trained attention heads share common key/query projections, we propose a collaborative multi-head attention layer that enables heads to learn shared projections. Our scheme decreases the number of parameters in an attention layer and can be used as a drop-in replacement in any transformer architecture.For instance, by allowing heads to collaborate on a neural machine translation task, we can reduce the key dimension by 4\u00d7 without any loss in performance. We also show that it is possible to re-parametrize a pre-trained multi-head attention layer into our collaborative attention layer. Even without retraining, collaborative multi-head attention manages to reduce the size of the key and query projections by half without sacrificing accuracy. 
Our code is public.", + "title": "Multi-Head Attention: Collaborate Instead of Concatenate", + "authors": [ + "Jean-Baptiste Cordonnier", + "Andreas Loukas", + "Martin Jaggi" + ], + "emails": [ + "~Martin_Jaggi1", + "~Jean-Baptiste_Cordonnier2", + "~Andreas_Loukas1" + ], + "rank": 1485 + }, + { + "url": "https://openreview.net/forum?id=lH2ukHnGDdq", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 5, + 4 + ], + "rating": "5.32", + "confidences": [ + 4, + 4, + 4, + 5, + 5 + ], + "abstract": "Deep neural networks have emerged as very successful tools for image restoration and reconstruction tasks. These networks are often trained end-to-end to directly reconstruct an image from a noisy or corrupted measurement of that image. To achieve state-of-the-art performance, training on large and diverse sets of images is considered critical. However, it is often difficult and/or expensive to collect large amounts of training images. Inspired by the success of Data Augmentation (DA) for classification problems, in this paper, we propose a pipeline for data augmentation for image reconstruction tasks arising in medical imaging and explore its effectiveness at reducing the required training data in a variety of settings. We focus on accelerated magnetic resonance imaging, where the goal is to reconstruct an image from a few under-sampled linear measurements. Our DA pipeline is specifically designed to utilize the invariances present in medical imaging measurements as naive DA strategies that neglect the physics of the problem fail. We demonstrate the effectiveness of our data augmentation pipeline by showing that for some problem regimes, DA can achieve comparable performance to the state of the art on the FastMRI dataset while using significantly fewer training data. 
Specifically, for 8-fold acceleration we achieve performance comparable to the state of the art with only 10% of the training data for multi-coil reconstruction and with only 33% of the training data for single-coil reconstruction. Our findings show that in the low-data regime DA is beneficial, whereas in the high-data regime it has diminishing returns.", + "title": "Data augmentation for deep learning based accelerated MRI reconstruction", + "authors": [ + "Zalan Fabian", + "Reinhard Heckel", + "Mahdi Soltanolkotabi" + ], + "emails": [ + "~Reinhard_Heckel1", + "~Mahdi_Soltanolkotabi1", + "~Zalan_Fabian1" + ], + "rank": 1486 + }, + { + "url": "https://openreview.net/forum?id=mfJepDyIUcQ", + "decision": "Reject", + "ratings": [ + 5, + 7, + 7, + 3 + ], + "rating": "5.31", + "confidences": [ + 3, + 4, + 2, + 4 + ], + "abstract": "Model-based reinforcement learning (RL) has emerged as a promising tool for developing controllers for real world systems (e.g., robotics, autonomous driving, etc.). However, real systems often have constraints imposed on their state space which must be satisfied to ensure the safety of the system and its environment. Developing a verification tool for RL algorithms is challenging because the non-linear structure of neural networks impedes analytical verification of such models or controllers. To this end, we present a novel safety verification framework for model-based RL controllers using reachable set analysis. The proposed framework can efficiently handle models and controllers which are represented using neural networks. 
Additionally, if a controller fails to satisfy the safety constraints in general, the proposed framework can also be used to identify the subset of initial states from which the controller can be safely executed.", + "title": "Safety Verification of Model Based Reinforcement Learning Controllers", + "authors": [ + "akshita gupta", + "Inseok Hwang" + ], + "emails": [ + "~akshita_gupta1", + "~Inseok_Hwang1" + ], + "rank": 1487 + }, + { + "url": "https://openreview.net/forum?id=Oq79NOiZB1H", + "decision": "Reject", + "ratings": [ + 7, + 7, + 4, + 4 + ], + "rating": "5.31", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "Graph Convolutional Networks (GCNs) have achieved impressive empirical advancement across a wide variety of graph-related applications. Despite their great success, training GCNs on large graphs suffers from computational and memory issues. A potential path to circumvent these obstacles is sampling-based methods, where at each layer a subset of nodes is sampled. Although recent studies have empirically demonstrated the effectiveness of sampling-based methods, these works lack theoretical convergence guarantees under realistic settings and cannot fully leverage the information of evolving parameters during optimization. In this paper, we describe and analyze a general \\textbf{\\textit{doubly variance reduction}} schema that can accelerate any sampling method under the memory budget. The motivating impetus for the proposed schema is a careful analysis for the variance of sampling methods where it is shown that the induced variance can be decomposed into node embedding approximation variance (\\emph{zeroth-order variance}) during forward propagation and layerwise-gradient variance (\\emph{first-order variance}) during backward propagation. We theoretically analyze the convergence of the proposed schema and show that it enjoys an O(1/T) convergence rate. 
We complement our theoretical results by integrating the proposed schema in different sampling methods and applying them to different large real-world graphs.", + "title": "On the Importance of Sampling in Training GCNs: Convergence Analysis and Variance Reduction", + "authors": [ + "Weilin Cong", + "Morteza Ramezani", + "Mehrdad Mahdavi" + ], + "emails": [ + "~Weilin_Cong1", + "~Mehrdad_Mahdavi2", + "~Morteza_Ramezani1" + ], + "rank": 1488 + }, + { + "url": "https://openreview.net/forum?id=R5M7Mxl1xZ", + "decision": "Reject", + "ratings": [ + 7, + 4, + 7, + 4 + ], + "rating": "5.31", + "confidences": [ + 3, + 4, + 4, + 5 + ], + "abstract": "Unsupervised image-to-image (I2I) translation, which aims to learn a domain mapping function without paired data, is very challenging because the function is highly under-constrained. Despite the significant progress in constraining the mapping function, current methods suffer from the \\textit{geometry distortion} problem: the geometry structure of the translated image is inconsistent with the input source image, which may cause the undesired distortions in the translated images. To remedy this issue, we propose a novel I2I translation constraint, called \\textit{Minimal Geometry-Distortion Constraint} (MGC), which promotes the consistency of geometry structures and reduce the unwanted distortions in translation by reducing the randomness of color transformation in the translation process. To facilitate estimation and maximization of MGC, we propose an approximate representation of mutual information called relative Squared-loss Mutual Information (rSMI) that can be efficiently estimated analytically. 
We demonstrate the effectiveness of our MGC by providing quantitative and qualitative comparisons with the state-of-the-art methods on several benchmark datasets.\n", + "title": "Minimal Geometry-Distortion Constraint for Unsupervised Image-to-Image Translation", + "authors": [ + "Jiaxian Guo", + "Jiachen Li", + "Mingming Gong", + "Huan Fu", + "Kun Zhang", + "Dacheng Tao" + ], + "emails": [ + "~Mingming_Gong1", + "~Huan_Fu1", + "~Dacheng_Tao1", + "~Jiaxian_Guo2", + "~Kun_Zhang1", + "~Jiachen_Li4" + ], + "rank": 1489 + }, + { + "url": "https://openreview.net/forum?id=nIqapkAyZ9_", + "decision": "Reject", + "ratings": [ + 4, + 6, + 6, + 5 + ], + "rating": "5.31", + "confidences": [ + 3, + 5, + 3, + 5 + ], + "abstract": "A neural network regularizer (eg, weight decay) boosts performance by explicitly penalizing the complexity of a network. In this paper, we penalize inferior network activations -- feature embeddings -- which in turn regularize the network's weights implicitly. We propose singular value maximization (SVMax) to learn a uniform feature embedding. The SVMax regularizer integrates seamlessly with both supervised and unsupervised learning. During training, our formulation mitigates model collapse and enables larger learning rates. Thus, our formulation converges in fewer epochs, which reduces the training computational cost. We evaluate the SVMax regularizer using both retrieval and generative adversarial networks. We leverage a synthetic mixture of Gaussians dataset to evaluate SVMax in an unsupervised setting. For retrieval networks, SVMax achieves significant improvement margins across various ranking losses.", + "title": "SVMax: A Feature Embedding Regularizer", + "authors": [ + "Ahmed Taha", + "Alex Hanson", + "Abhinav Shrivastava", + "Larry S. 
Davis" + ], + "emails": [ + "~Abhinav_Shrivastava2", + "~Ahmed_Taha1", + "~Alex_Hanson1", + "~Larry_S._Davis1" + ], + "rank": 1490 + }, + { + "url": "https://openreview.net/forum?id=TlPHO_duLv", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5 + ], + "rating": "5.31", + "confidences": [ + 4, + 4, + 5 + ], + "abstract": "Training deep object detectors requires large amounts of human-annotated images with accurate object labels and bounding box coordinates, which are extremely expensive to acquire. Noisy annotations are much more easily accessible, but they could be detrimental for learning. We address the challenging problem of training object detectors with noisy annotations, where the noise contains a mixture of label noise and bounding box noise. We propose a learning framework which jointly optimizes object labels, bounding box coordinates, and model parameters by performing alternating noise correction and model training. To disentangle label noise and bounding box noise, we propose a two-step noise correction method. The first step performs class-agnostic bounding box correction, and the second step performs label correction and class-specific bounding box refinement. We conduct experiments on PASCAL VOC and MS-COCO dataset with both synthetic noise and machine-generated noise. Our method achieves state-of-the-art performance by effectively cleaning both label noise and bounding box noise.", + "title": "Towards Noise-resistant Object Detection with Noisy Annotations", + "authors": [ + "Junnan Li", + "Caiming Xiong", + "Steven Hoi" + ], + "emails": [ + "~Steven_Hoi2", + "~Caiming_Xiong1", + "~Junnan_Li2" + ], + "rank": 1491 + }, + { + "url": "https://openreview.net/forum?id=LIOgGKRCYkG", + "decision": "Reject", + "ratings": [ + 5, + 5, + 7, + 5 + ], + "rating": "5.31", + "confidences": [ + 3, + 3, + 2, + 5 + ], + "abstract": "Recent adversarial defense approaches have failed. Untargeted gradient-based attacks cause classifiers to choose any wrong class. 
Our novel white-box defense tricks untargeted attacks into becoming attacks targeted at designated target classes. From these target classes, we derive the real classes. The Target Training defense tricks the minimization at the core of untargeted, gradient-based adversarial attacks: minimize the sum of (1) perturbation and (2) classifier adversarial loss. Target Training changes the classifier minimally, and trains it with additional duplicated points (at 0 distance) labeled with designated classes. These differently-labeled duplicated samples minimize both terms (1) and (2) of the minimization, steering attack convergence to samples of designated classes, from which correct classification is derived. Importantly, Target Training eliminates the need to know the attack and the overhead of generating adversarial samples of attacks that minimize perturbations. Without using adversarial samples and against an adaptive attack aware of our defense, Target Training exceeds even default, unsecured classifier accuracy of 84.3% for CIFAR10 with 86.6% against DeepFool attack; and achieves 83.2% against CW-L2 (\u03ba=0) attack. Using adversarial samples, we achieve 75.6% against CW-L2 (\u03ba=40). Due to our deliberate choice of low-capacity classifiers, Target Training does not withstand L\u221e adaptive attacks in CIFAR10 but withstands CW-L\u221e (\u03ba=0) in MNIST. Target Training presents a fundamental change in adversarial defense strategy.", + "title": "Target Training: Tricking Adversarial Attacks to Fail", + "authors": [ + "Blerta Lindqvist" + ], + "emails": [ + "~Blerta_Lindqvist1" + ], + "rank": 1492 + }, + { + "url": "https://openreview.net/forum?id=aI8VuzSvCPn", + "decision": "Reject", + "ratings": [ + 3, + 6, + 6, + 6 + ], + "rating": "5.31", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Program synthesis is the task of automatically generating a program consistent with a given specification. 
A natural way to specify programs is to provide examples of desired input-output behavior, and many current program synthesis approaches have achieved impressive results after training on randomly generated input-output examples. However, recent work has discovered that some of these approaches generalize poorly to data distributions different from that of the randomly generated examples. We show that this problem applies to other state-of-the-art approaches as well and that current methods to counteract this problem are insufficient. We then propose a new, adversarial approach to control the bias of synthetic data distributions and show that it outperforms current approaches.", + "title": "Adversarial Synthetic Datasets for Neural Program Synthesis", + "authors": [ + "Alexander Suh", + "Yuval Timen" + ], + "emails": [ + "northeastern.edu", + "~Alexander_Suh1" + ], + "rank": 1493 + }, + { + "url": "https://openreview.net/forum?id=3c3EhwbKoXw", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5 + ], + "rating": "5.31", + "confidences": [ + 5, + 4, + 4 + ], + "abstract": "Earth observing satellites carrying multi-spectral sensors are widely used to monitor the physical and biological states of the atmosphere, land, and oceans. These satellites have different vantage points above the earth and different spectral imaging bands resulting in inconsistent imagery from one to another. This presents challenges in building downstream applications. What if we could generate synthetic bands for existing satellites from the union of all domains? We tackle the problem of generating synthetic spectral imagery for multispectral sensors as an unsupervised image-to-image translation problem with partial labels and introduce a novel shared spectral reconstruction loss. Simulated experiments performed by dropping one or more spectral bands show that cross-domain reconstruction outperforms measurements obtained from a second vantage point. 
On a downstream cloud detection task, we show that generating synthetic bands with our model improves segmentation performance beyond our baseline. Our proposed approach enables synchronization of multispectral data and provides a basis for more homogeneous remote sensing datasets.", + "title": "Spectral Synthesis for Satellite-to-Satellite Translation", + "authors": [ + "Thomas Vandal", + "Daniel McDuff", + "Weile Wang", + "Andrew Michaelis", + "Ramakrishna Nemani" + ], + "emails": [ + "nasa.gov", + "~Daniel_McDuff1", + "gmail.com", + "hyperplane.org", + "~Thomas_Vandal1" + ], + "rank": 1494 + }, + { + "url": "https://openreview.net/forum?id=pg9c6etTWXR", + "decision": "Reject", + "ratings": [ + 7, + 6, + 4, + 4 + ], + "rating": "5.31", + "confidences": [ + 3, + 4, + 4, + 2 + ], + "abstract": "Laplace approximations are classic, computationally lightweight means to construct Bayesian neural networks (BNNs). As in other approximate BNNs, one cannot necessarily expect the induced predictive uncertainty to be calibrated. Here we develop a formalism to explicitly \"train\" the uncertainty in a decoupled way to the prediction itself. To this end we introduce uncertainty units for Laplace-approximated networks: Hidden units with zero weights that can be added to any pre-trained, point-estimated network. Since these units are inactive, they do not affect the predictions. But their presence changes the geometry (in particular the Hessian) of the loss landscape around the point estimate, thereby affecting the network's uncertainty estimates under a Laplace approximation. 
We show that such units can be trained via an uncertainty-aware objective, making the Laplace approximation competitive with more expensive alternative uncertainty-quantification frameworks.", + "title": "Learnable Uncertainty under Laplace Approximations", + "authors": [ + "Agustinus Kristiadi", + "Matthias Hein", + "Philipp Hennig" + ], + "emails": [ + "~Philipp_Hennig1", + "~Agustinus_Kristiadi1", + "~Matthias_Hein2" + ], + "rank": 1495 + }, + { + "url": "https://openreview.net/forum?id=0MjC3uMthAb", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 4 + ], + "rating": "5.31", + "confidences": [ + 1, + 5, + 3, + 4 + ], + "abstract": "Early few-shot classification work advocates for episodic training, i.e. training over learning episodes each posing a few-shot classification task. However, the role of this training regime remains poorly understood, and its usefulness is still debated. Standard classification training methods (``pre-training'') followed by episodic fine-tuning have recently achieved strong results. This work aims to understand the role of this episodic fine-tuning phase through an exploration of the effect of the ``shot'' setting (number of examples per class) that is used during fine-tuning. We discover that fine-tuning on episodes of a particular shot can specialize the pre-trained model to solving episodes of that shot at the expense of performance on other shots, in agreement with a trade-off recently observed in the context of end-to-end episodic training. To amend this, we propose a shot-conditional form of episodic fine-tuning, inspired from recent work that trains a single model on a distribution of losses. Our investigation shows that this improves overall performance, without suffering disproportionately on any shot. We also examine the usefulness of this approach on the large-scale Meta-Dataset benchmark where test episodes exhibit varying shots and imbalanced classes. 
We find that our flexible model improves performance in that challenging environment.", + "title": "Learning Flexible Classifiers with Shot-CONditional Episodic (SCONE) Training", + "authors": [ + "Eleni Triantafillou", + "Vincent Dumoulin", + "Hugo Larochelle", + "Richard Zemel" + ], + "emails": [ + "~Eleni_Triantafillou1", + "~Vincent_Dumoulin1", + "~Richard_Zemel1", + "~Hugo_Larochelle1" + ], + "rank": 1496 + }, + { + "url": "https://openreview.net/forum?id=OLrVttqVt2", + "decision": "Reject", + "ratings": [ + 5, + 6, + 7, + 3 + ], + "rating": "5.31", + "confidences": [ + 4, + 3, + 5, + 4 + ], + "abstract": "In a poisoning attack, an adversary with control over a small fraction of the training data attempts to select that data in a way that induces a model that misbehaves in a particular way desired by the adversary, such as misclassifying certain inputs. We propose an efficient poisoning attack that can target a desired model based on online convex optimization. Unlike previous model-targeted poisoning attacks, our attack comes with provable convergence to any achievable target classifier. The distance from the induced classifier to the target classifier is inversely proportional to the square root of the number of poisoning points. We also provide a lower bound on the minimum number of poisoning points needed to achieve a given target classifier. Our attack is the first model-targeted poisoning attack that provides provable convergence, and in our experiments it either exceeds or matches the best state-of-the-art attacks in terms of attack success rate and distance to the target model. In addition, as an online attack our attack can incrementally determine nearly optimal poisoning points. 
", + "title": "Model-Targeted Poisoning Attacks with Provable Convergence", + "authors": [ + "Fnu Suya", + "Saeed Mahloujifar", + "David Evans", + "Yuan Tian" + ], + "emails": [ + "~Fnu_Suya1", + "~Saeed_Mahloujifar1", + "~Yuan_Tian2", + "~David_Evans1" + ], + "rank": 1497 + }, + { + "url": "https://openreview.net/forum?id=bNdohBx9sPa", + "decision": "Reject", + "ratings": [ + 5, + 6, + 7, + 4 + ], + "rating": "5.31", + "confidences": [ + 4, + 4, + 3, + 5 + ], + "abstract": "Unsupervised domain adaptation transfers knowledge from learned source domain to a different (but related) target distribution, for which only few or no labeled data is available. Some researchers proposed upper bounds for the target error when transferring the knowledge, i.e.,Ben-David et al. (2010) established a theory based on minimizing the source error and distance between marginal distributions simultaneously. However, in most works the joint error is usually ignored. In this paper, we argue that the joint error is essential for the domain adaptation problem, in particular if the samples from different classes in source/target are closely aligned when matching the marginal distributions. To tackle this problem, we propose a novel upper bound that includes the joint error. Moreover, we utilize a constrained hypothesis space to further tighten up this bound. Furthermore, we propose a novel cross margin discrepancy to measure the dissimilarity between hypotheses. We show in this paper that the new cross margin discrepancy is able to alleviate instability during adversarial learning. 
In addition, we present extensive empirical evidence that shows that our proposal outperforms related approaches in image classification error rates on standard domain adaptation benchmarks.", + "title": "Unsupervised Domain Adaptation via Minimized Joint Error", + "authors": [ + "Dexuan Zhang", + "Tatsuya Harada" + ], + "emails": [ + "~Dexuan_Zhang1", + "~Tatsuya_Harada1" + ], + "rank": 1498 + }, + { + "url": "https://openreview.net/forum?id=4sCyjwaVtZ9", + "decision": "Reject", + "ratings": [ + 4, + 4, + 7, + 7 + ], + "rating": "5.31", + "confidences": [ + 5, + 4, + 4, + 3 + ], + "abstract": "Machine learning is predicated on the concept of generalization: a model achieving low error on a sufficiently large training set should also perform well on novel samples from the same distribution. We show that both data whitening and second order optimization can harm or entirely prevent generalization. In general, model training harnesses information contained in the sample-sample second moment matrix of a dataset. For a general class of models, namely models with a fully connected first layer, we prove that the information contained in this matrix is the only information which can be used to generalize. Models trained using whitened data, or with certain second order optimization schemes, have less access to this information; in the high dimensional regime they have no access at all, resulting in poor or nonexistent generalization ability. We experimentally verify these predictions for several architectures, and further demonstrate that generalization continues to be harmed even when theoretical requirements are relaxed. 
However, we also show experimentally that regularized second order optimization can provide a practical tradeoff, where training is accelerated but less information is lost, and generalization can in some circumstances even improve.", + "title": "Whitening and second order optimization both destroy information about the dataset, and can make generalization impossible", + "authors": [ + "Neha S. Wadia", + "Daniel Duckworth", + "Samuel Stern Schoenholz", + "Ethan Dyer", + "Jascha Sohl-Dickstein" + ], + "emails": [ + "~Jascha_Sohl-Dickstein2", + "~Ethan_Dyer1", + "~Neha_S._Wadia1", + "~Samuel_Stern_Schoenholz1", + "~Daniel_Duckworth1" + ], + "rank": 1499 + }, + { + "url": "https://openreview.net/forum?id=hecuSLbL_vC", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5 + ], + "rating": "5.30", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "In Continual Learning settings, deep neural networks are prone to Catastrophic Forgetting. Orthogonal Gradient Descent (Farajtabar et al., 2019) was proposed to tackle the challenge. However, no theoretical guarantees have been proven yet. We present a theoretical framework to study Continual Learning algorithms in the NTK regime. This framework comprises closed form expression of the model through tasks and proxies for transfer learning, generalisation and tasks similarity. In this framework, we prove that OGD is robust to Catastrophic Forgetting then derive the first generalisation bound for SGD and OGD for Continual Learning. 
Finally, we study the limits of this framework in practice for OGD and highlight the importance of the NTK variation for Continual Learning.", + "title": "Generalisation Guarantees For Continual Learning With Orthogonal Gradient Descent", + "authors": [ + "Mehdi Abbana Bennani", + "Thang Doan", + "Masashi Sugiyama" + ], + "emails": [ + "~Mehdi_Abbana_Bennani1", + "~Thang_Doan1", + "~Masashi_Sugiyama1" + ], + "rank": 1500 + }, + { + "url": "https://openreview.net/forum?id=zgGmAx9ZcY", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5 + ], + "rating": "5.30", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "Feedback alignment was proposed to address the biological implausibility of the backpropagation algorithm which requires the transportation of the weight transpose during the backwards pass. The idea was later built upon with the proposal of direct feedback alignment (DFA), which propagates the error directly from the output layer to each hidden layer in the backward path using a fixed random weight matrix. This contribution was significant because it allowed for the parallelization of the backwards pass by the use of these feedback connections. However, just as feedback alignment, DFA does not perform well in deep convolutional networks. We propose to learn the backward weight matrices in DFA, adopting the methodology of Kolen-Pollack learning, to improve training and inference accuracy in deep convolutional neural networks by updating the direct feedback connections such that they come to estimate the forward path. 
The proposed method improves the accuracy of learning by direct feedback connections and reduces the gap between parallel training to serial training by means of backpropagation.", + "title": "Learning the Connections in Direct Feedback Alignment", + "authors": [ + "Matthew Bailey Webster", + "Jonghyun Choi", + "changwook Ahn" + ], + "emails": [ + "~Matthew_Bailey_Webster1", + "gist.ac.kr", + "~Jonghyun_Choi1" + ], + "rank": 1501 + }, + { + "url": "https://openreview.net/forum?id=uUlGTEbBRL", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5 + ], + "rating": "5.30", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "Many designs have recently been proposed to improve the model efficiency of convolutional neural networks (CNNs) at a fixed resource budget, while there is a lack of theoretical analysis to justify them. This paper first formulates CNNs with high-order inputs into statistical models, which have a special \"Tucker-like\" formulation. This makes it possible to further conduct the sample complexity analysis to CNNs as well as compressed CNNs via tensor decomposition. Tucker and CP decompositions are commonly adopted to compress CNNs in the literature. The low rank assumption is usually imposed on the output channels, which according to our study, may not be beneficial to obtain a computationally efficient model while a similar accuracy can be maintained. 
Our finding is further supported by ablation studies on CIFAR10, SVNH and UCF101 datasets.", + "title": "Rethinking Compressed Convolution Neural Network from a Statistical Perspective", + "authors": [ + "Feiqing Huang", + "Yuefeng Si", + "Guodong Li" + ], + "emails": [ + "~Feiqing_Huang1", + "hku.hk" + ], + "rank": 1502 + }, + { + "url": "https://openreview.net/forum?id=r1d-lFmO-cM", + "decision": "Reject", + "ratings": [ + 4, + 7, + 5 + ], + "rating": "5.30", + "confidences": [ + 3, + 3, + 4 + ], + "abstract": "Ordinary (pointwise) binary classification aims to learn a binary classifier from pointwise labeled data. However, such pointwise labels may not be directly accessible due to privacy, confidentiality, or security considerations. In this case, can we still learn an accurate binary classifier? This paper proposes a novel setting, namely pairwise comparison (Pcomp) classification, where we are given only pairs of unlabeled data that we know one is more likely to be positive than the other, instead of pointwise labeled data. Compared with pointwise labels, pairwise comparisons are easier to collect, and Pcomp classification is useful for subjective classification tasks. To solve this problem, we present a mathematical formulation for the generation process of pairwise comparison data, based on which we exploit an unbiased risk estimator (URE) to train a binary classifier by empirical risk minimization and establish an estimation error bound. We first prove that a URE can be derived and improve it using correction functions. Then, we start from the noisy-label learning perspective to introduce a progressive URE and improve it by imposing consistency regularization. 
Finally, experiments validate the effectiveness of our proposed solutions for Pcomp classification.", + "title": "Pointwise Binary Classification with Pairwise Confidence Comparisons", + "authors": [ + "Lei Feng", + "Senlin Shu", + "Nan Lu", + "Bo Han", + "Miao Xu", + "Gang Niu", + "Bo An", + "Masashi Sugiyama" + ], + "emails": [ + "~Lei_Feng1", + "~Masashi_Sugiyama1", + "~Miao_Xu1", + "~Nan_Lu1", + "~Bo_An2", + "email.swu.edu.cn", + "~Bo_Han1", + "~Gang_Niu1" + ], + "rank": 1503 + }, + { + "url": "https://openreview.net/forum?id=bIQF55zCpWf", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 6 + ], + "rating": "5.30", + "confidences": [ + 4, + 2, + 3, + 1 + ], + "abstract": "Regularization plays a crucial role in machine learning models, especially for deep neural networks. The existing regularization techniques mainly rely on the i.i.d. assumption and only consider the knowledge from the current sample, without the leverage of the neighboring relationship between samples. In this work, we propose a general regularizer called Patch-level Neighborhood Interpolation~(\\textbf{Pani}) that conducts a non-local representation in the computation of network. Our proposal explicitly constructs patch-level graphs in different network layers and then linearly interpolates neighborhood patch features, serving as a general and effective regularization strategy. Further, we customize our approach into two kinds of popular regularization methods, namely Virtual Adversarial Training (VAT) and MixUp as well as its variants. The first derived \\textbf{Pani VAT} presents a novel way to construct non-local adversarial smoothness by employing patch-level interpolated perturbations. In addition, the second derived \\textbf{Pani MixUp} method extends the original MixUp regularization and its variant to the Pani version, achieving a significant improvement in the performance. 
Finally, extensive experiments are conducted to verify the effectiveness of our Patch-level Neighborhood Interpolation approach in both supervised and semi-supervised settings.", + "title": "Patch-level Neighborhood Interpolation: A General and Effective Graph-based Regularization Strategy", + "authors": [ + "Ke Sun", + "Bing Yu", + "Zhouchen Lin", + "Zhanxing Zhu" + ], + "emails": [ + "~Zhouchen_Lin1", + "~Zhanxing_Zhu1", + "~Ke_Sun3", + "~Bing_Yu1" + ], + "rank": 1504 + }, + { + "url": "https://openreview.net/forum?id=LxhlyKH6VP", + "decision": "Reject", + "ratings": [ + 4, + 5, + 7, + 5 + ], + "rating": "5.29", + "confidences": [ + 4, + 4, + 4, + 2 + ], + "abstract": "Understanding the protein conformational landscape is critical, as protein function, as well as modulations thereof due to ligand binding or changes in environment, are intimately connected with structural variations. This work focuses on learning a generative neural network on a simulated ensemble of protein structures obtained using molecular simulation to characterize the distinct structural fluctuations of a protein bound to various drug molecules. Specifically, we use a geometric autoencoder framework to learn separate latent space encodings of the intrinsic and extrinsic geometries of the system. For this purpose, the proposed Protein Geometric AutoEncoder (ProGAE) model is trained on the length of the alpha-carbon pseudobonds and the orientation of the backbone bonds of the protein. Using ProGAE latent embeddings, we reconstruct and generate the conformational ensemble of a protein at or near the experimental resolution. Empowered by the disentangled latent space learning, the intrinsic latent embedding help in geometric error correction, whereas the extrinsic latent embedding is successfully used for classification or property prediction of different drugs bound to a specific protein. 
Additionally, ProGAE is able to be transferred to the structures of a different state of the same protein or to a completely different protein of different size, where only the dense layer decoding from the latent representation needs to be retrained. Results show that our geometric learning-based method enjoys both accuracy and efficiency for generating complex structural variations, charting the path toward scalable and improved approaches for analyzing and enhancing molecular simulations.", + "title": "ProGAE: A Geometric Autoencoder-based Generative Model for Disentangling Protein Conformational Space", + "authors": [ + "Norman Joseph Tatro", + "Payel Das", + "Pin-Yu Chen", + "Vijil Chenthamarakshan", + "Rongjie Lai" + ], + "emails": [ + "~Pin-Yu_Chen1", + "~Norman_Joseph_Tatro1", + "~Payel_Das1", + "~Rongjie_Lai4", + "~Vijil_Chenthamarakshan1" + ], + "rank": 1505 + }, + { + "url": "https://openreview.net/forum?id=HUd2wQ0j200", + "ratings": [ + 5, + 5, + 5, + 6 + ], + "rating": "5.29", + "confidences": [ + 3, + 5, + 4, + 5 + ], + "abstract": "Recent breakthroughs of Neural Architecture Search (NAS) are extending the field's research scope towards a broader range of vision tasks and more diversified search spaces. While existing NAS methods mostly design architectures on one single task, algorithms that look beyond single-task search are surging to pursue a more efficient and universal solution across various tasks. Many of them leverage transfer learning and seek to preserve, reuse, and refine network design knowledge to achieve higher efficiency in future tasks. However, the huge computational cost and experiment complexity of cross-task NAS are imposing barriers for valuable research in this direction. Existing transferrable NAS algorithms are also based on different settings, e.g., datasets and search spaces, which raises concerns on performance comparability. 
Although existing NAS benchmarks provided some solutions, they all focus on one single type of vision task, i.e., classification. In this work, we propose TransNAS-Bench-101, a benchmark containing network performance across 7 tasks, covering classification, regression, pixel-level prediction, and self-supervised tasks. This diversity provides opportunities to transfer NAS methods among the tasks, and allows for more complex transfer schemes to evolve. We explore two fundamentally different types of search spaces: cell-level search space and macro-level search space. With 7,352 backbones evaluated on 7 tasks, 51,464 trained models with detailed training information are provided. Generating this benchmark takes about 193,760 GPU hours, which is equivalent to 22.12 years of computation on a single Nvidia V100 GPU. Analysis of 4 benchmark transfer schemes highlights that: (1) Direct deployment of both architectures and policies can easily lead to negative transfer unless guided by carefully designed mechanisms. (2) Evolutionary methods' role in transferrable NAS might be overlooked in the past. (3) It is a valid challenge for NAS algorithms to perform well across tasks and search spaces consistently. We also provide our suggestions for future research along with the analysis. 
With TransNAS-Bench-101, we hope to encourage the advent of exceptional NAS algorithms that raise cross-task search efficiency and generalizability to the next level.", + "title": "TransNAS-Bench-101: Improving Transferrability and Generalizability of Cross-Task Neural Architecture Search", + "authors": [ + "Yawen Duan", + "Xin Chen", + "Hang Xu", + "Zewei Chen", + "Xiaodan Liang", + "Tong Zhang", + "Zhenguo Li" + ], + "emails": [ + "~Tong_Zhang2", + "~Xin_Chen15", + "~Hang_Xu1", + "~Yawen_Duan1", + "~Zewei_Chen1", + "~Xiaodan_Liang2", + "~Zhenguo_Li1" + ], + "rank": 1506 + }, + { + "url": "https://openreview.net/forum?id=9t0CV2iD5gE", + "decision": "Reject", + "ratings": [ + 7, + 7, + 5, + 3 + ], + "rating": "5.29", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "This paper proposes SplitSGD, a new dynamic learning rate schedule for stochastic optimization. This method decreases the learning rate for better adaptation to the local geometry of the objective function whenever a stationary phase is detected, that is, the iterates are likely to bounce at around a vicinity of a local minimum. The detection is performed by splitting the single thread into two and using the inner product of the gradients from the two threads as a measure of stationarity. Owing to this simple yet provably valid stationarity detection, SplitSGD is easy- to-implement and essentially does not incur additional computational cost than standard SGD. Through a series of extensive experiments, we show that this method is appropriate for both convex problems and training (non-convex) neural networks, with performance compared favorably to other stochastic optimization methods. 
Importantly, this method is observed to be very robust with a set of default parameters for a wide range of problems and, moreover, yields better generalization performance than other adaptive gradient methods such as Adam.", + "title": "Robust Learning Rate Selection for Stochastic Optimization via Splitting Diagnostic", + "authors": [ + "Matteo Sordello", + "Hangfeng He", + "Weijie J Su" + ], + "emails": [ + "~Hangfeng_He3", + "~Weijie_J_Su1", + "~Matteo_Sordello1" + ], + "rank": 1507 + }, + { + "url": "https://openreview.net/forum?id=qmI0P1ZExUl", + "ratings": [ + 6, + 6, + 4 + ], + "rating": "5.29", + "confidences": [ + 5, + 4, + 5 + ], + "abstract": "We present a generic image-to-image translation framework, Pixel2Style2Pixel (pSp). Our pSp framework is based on a novel encoder network that directly generates a series of style vectors which are fed into a pretrained StyleGAN generator, forming the extended W+ latent space. We first show that our encoder can directly embed real images into W+, with no additional optimization. We further introduce a dedicated identity loss which is shown to achieve improved performance in the reconstruction of an input image. We demonstrate pSp to be a simple architecture that, by leveraging a well-trained, fixed generator network, can be easily applied on a wide-range of image-to-image translation tasks. Solving these tasks through the style representation results in a global approach that does not rely on a local pixel-to-pixel correspondence and further supports multi-modal synthesis via the resampling of styles. 
Notably, we demonstrate that pSp can be trained to align a face image to a frontal pose with no labeled data and generate multi-modal results for ambiguous tasks such as conditional face generation from sketches and segmentation maps.", + "title": "Encoding in Style: a StyleGAN Encoder for Image-to-Image Translation", + "authors": [ + "Elad Richardson", + "Yuval Alaluf", + "Or Patashnik", + "Yotam Nitzan", + "Yaniv Azar", + "Stav Shapiro", + "Daniel Cohen-or" + ], + "emails": [ + "gmail.com", + "~Daniel_Cohen-or2", + "~Stav_Shapiro1", + "~Elad_Richardson2" + ], + "rank": 1508 + }, + { + "url": "https://openreview.net/forum?id=vT0NSQlTA", + "decision": "Reject", + "ratings": [ + 5, + 4, + 6, + 6 + ], + "rating": "5.29", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "Learning complex behaviors through interaction requires coordinated long-term planning. Random exploration and novelty search lack task-centric guidance and waste effort on non-informative interactions. Instead, decision making should target samples with the potential to optimize performance far into the future, while only reducing uncertainty where conducive to this objective. This paper presents latent optimistic value exploration (LOVE), a strategy that enables deep exploration through optimism in the face of uncertain long-term rewards. We combine finite-horizon rollouts from a latent model with value function estimates to predict infinite-horizon returns and recover associated uncertainty through ensembling. Policy training then proceeds on an upper confidence bound (UCB) objective to identify and select the interactions most promising to improve long-term performance. We apply LOVE to continuous visual control tasks and demonstrate improved sample complexity on a selection of benchmarking tasks. 
", + "title": "Learning to Plan Optimistically: Uncertainty-Guided Deep Exploration via Latent Model Ensembles", + "authors": [ + "Tim Seyde", + "Wilko Schwarting", + "Sertac Karaman", + "Daniela Rus" + ], + "emails": [ + "~Sertac_Karaman1", + "~Wilko_Schwarting1", + "~Daniela_Rus1", + "~Tim_Seyde1" + ], + "rank": 1509 + }, + { + "url": "https://openreview.net/forum?id=VRgITLy0l2", + "decision": "Reject", + "ratings": [ + 7, + 7, + 4, + 4 + ], + "rating": "5.29", + "confidences": [ + 3, + 3, + 3, + 5 + ], + "abstract": "In this paper, we perform Lyapunov based analysis of the loss function to derive an a priori upper bound on the settling time of deep neural networks. While previous studies have attempted to understand deep learning using control theory framework, there is limited work on a priori finite time convergence analysis. Drawing from the advances in analysis of finite-time control of non-linear systems, we provide a priori guarantees of finite-time convergence in a deterministic control theoretic setting. We formulate the supervised learning framework as a control problem where weights of the network are control inputs and learning translates into a tracking problem. An analytical formula for finite-time upper bound on settling time is provided a priori under the assumptions of boundedness of input. Finally, we prove that our loss function is robust against input perturbations. ", + "title": "A priori guarantees of finite-time convergence for Deep Neural Networks", + "authors": [ + "Anushree Rankawat", + "Mansi Rankawat", + "Harshal B. Oza" + ], + "emails": [ + "~Mansi_Rankawat1", + "sot.pdpu.ac.in", + "~Anushree_Rankawat1" + ], + "rank": 1510 + }, + { + "url": "https://openreview.net/forum?id=y13JLBiNMsf", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 5 + ], + "rating": "5.29", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Transformers with soft attention have been widely adopted in various sequence-to-sequence (Seq2Seq) tasks. 
Whereas soft attention is effective for learning semantic similarities between queries and keys based on their contents, it does not explicitly model the order of elements in sequences which is crucial for monotonic Seq2Seq tasks. Learning monotonic alignments between input and output sequences may be beneficial for long-form and online inference applications that are still challenging for the conventional soft attention algorithm. Herein, we focus on monotonic Seq2Seq tasks and propose a source-aware Gaussian mixture model attention in which the attention scores are monotonically calculated considering both the content and order of the source sequence. We experimentally demonstrate that the proposed attention mechanism improved the performance on the online and long-form speech recognition problems without performance degradation in offline in-distribution speech recognition.", + "title": "Learning Monotonic Alignments with Source-Aware GMM Attention", + "authors": [ + "Tae Gyoon Kang", + "Ho-Gyeong Kim", + "Min-Joong Lee", + "Jihyun Lee", + "Seongmin Ok", + "Hoshik Lee", + "Young Sang Choi" + ], + "emails": [ + "~Seongmin_Ok1", + "samsung.com", + "~Tae_Gyoon_Kang1" + ], + "rank": 1511 + }, + { + "url": "https://openreview.net/forum?id=qpsl2dR9twy", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 4, + 6 + ], + "rating": "5.29", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Communication is one of the core components for learning coordinated behavior in multi-agent systems.\nIn this paper, we propose a new communication scheme named Intention Sharing (IS) for multi-agent reinforcement learning in order to enhance the coordination among agents. In the proposed IS scheme, each agent generates an imagined trajectory by modeling the environment dynamics and other agents' actions. 
The imagined trajectory is the simulated future trajectory of each agent based on the learned model of the environment dynamics and other agents and represents each agent's future action plan. Each agent compresses this imagined trajectory capturing its future action plan to generate its intention message for communication by applying an attention mechanism to learn the relative importance of the components in the imagined trajectory based on the received message from other agents. Numerical results show that the proposed IS scheme outperforms other communication schemes in multi-agent reinforcement learning.", + "title": "Communication in Multi-Agent Reinforcement Learning: Intention Sharing", + "authors": [ + "Woojun Kim", + "Jongeui Park", + "Youngchul Sung" + ], + "emails": [ + "~Youngchul_Sung1", + "~Woojun_Kim1", + "~Jongeui_Park1" + ], + "rank": 1512 + }, + { + "url": "https://openreview.net/forum?id=Hf3qXoiNkR", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 7, + 7, + 2 + ], + "rating": "5.29", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "State-of-the-art natural language processing (NLP) models often learn to model dataset biases and surface form correlations instead of features that target the intended underlying task. Previous work has demonstrated effective methods to circumvent these issues when knowledge of the bias is available. We consider cases where the bias issues may not be explicitly identified, and show a method for training models that learn to ignore these problematic correlations. Our approach relies on the observation that models with limited capacity primarily learn to exploit biases in the dataset. We can leverage the errors of such limited capacity models to train a more robust model in a product of experts, thus bypassing the need to hand-craft a biased model. 
We show the effectiveness of this method to retain improvements in out-of-distribution settings even if no particular bias is targeted by the biased model.", + "title": "Learning from others' mistakes: Avoiding dataset biases without modeling them", + "authors": [ + "Victor Sanh", + "Thomas Wolf", + "Yonatan Belinkov", + "Alexander M Rush" + ], + "emails": [ + "~Victor_Sanh1", + "~Thomas_Wolf1", + "~Yonatan_Belinkov1", + "~Alexander_M_Rush1" + ], + "rank": 1513 + }, + { + "url": "https://openreview.net/forum?id=1toB0Fo9CZy", + "decision": "Reject", + "ratings": [ + 7, + 4, + 4, + 6 + ], + "rating": "5.29", + "confidences": [ + 4, + 5, + 3, + 5 + ], + "abstract": "In this paper, we propose a new neural architecture search (NAS) problem of Symmetric Positive Definite (SPD) manifold networks. Unlike the conventional NAS problem, our problem requires to search for a unique computational cell called the SPD cell. This SPD cell serves as a basic building block of SPD neural architectures. An efficient solution to our problem is important to minimize the extraneous manual effort in the SPD neural architecture design. To accomplish this goal, we first introduce a geometrically rich and diverse SPD neural architecture search space for an efficient SPD cell design. Further, we model our new NAS problem using the supernet strategy, which models the architecture search problem as a one-shot training process of a single supernet. Based on the supernet modeling, we exploit a differentiable NAS algorithm on our relaxed continuous search space for SPD neural architecture search. Statistical evaluation of our method on drone, action, and emotion recognition tasks mostly provides better results than the state-of-the-art SPD networks and NAS algorithms. 
Empirical results show that our algorithm excels in discovering better SPD network design and providing models that are more than 3 times lighter than searched by state-of-the-art NAS algorithms.", + "title": "Neural Architecture Search of SPD Manifold Networks", + "authors": [ + "Rhea Sanjay Sukthanker", + "Zhiwu Huang", + "Suryansh Kumar", + "Erik Goron", + "Yan Wu", + "Luc Van Gool" + ], + "emails": [ + "~Luc_Van_Gool1", + "~Erik_Goron1", + "~Suryansh_Kumar1", + "~Rhea_Sanjay_Sukthanker1", + "~Yan_Wu4", + "~Zhiwu_Huang1" + ], + "rank": 1514 + }, + { + "url": "https://openreview.net/forum?id=4K_NaDAHc0d", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 6 + ], + "rating": "5.29", + "confidences": [ + 4, + 5, + 3, + 5 + ], + "abstract": "Meta-learning, transfer learning and multi-task learning have recently laid a path towards more generally applicable reinforcement learning agents that are not limited to a single task. However, most existing approaches implicitly assume a uniform similarity between tasks. We argue that this assumption is limiting in settings where the relationship between tasks is unknown a-priori. In this work, we propose a general approach to automatically cluster together similar tasks during training. Our method, inspired by the expectation-maximization algorithm, succeeds at finding clusters of related tasks and uses these to improve sample complexity. We achieve this by designing an agent with multiple policies. In the expectation step, we evaluate the performance of the policies on all tasks and assign each task to the best performing policy. In the maximization step, each policy trains by sampling tasks from its assigned set. This method is intuitive, simple to implement and orthogonal to other multi-task learning algorithms. We show the generality of our approach by evaluating on simple discrete and continuous control tasks, as well as complex bipedal walker tasks and Atari games. 
Results show improvements in sample complexity as well as a more general applicability when compared to other approaches.", + "title": "Unsupervised Task Clustering for Multi-Task Reinforcement Learning", + "authors": [ + "Johannes Ackermann", + "Oliver Paul Richter", + "Roger Wattenhofer" + ], + "emails": [ + "~Roger_Wattenhofer1", + "~Johannes_Ackermann1", + "~Oliver_Paul_Richter1" + ], + "rank": 1515 + }, + { + "url": "https://openreview.net/forum?id=9SS69KwomAM", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 6, + 5, + 3 + ], + "rating": "5.29", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "We propose a novel learning paradigm, Self-Imitation via Reduction (SIR), for solving compositional reinforcement learning problems. SIR is based on two core ideas: task reduction and self-imitation. Task reduction tackles a hard-to-solve task by actively reducing it to an easier task whose solution is known by the RL agent. Once the original hard task is successfully solved by task reduction, the agent naturally obtains a self-generated solution trajectory to imitate. By continuously collecting and imitating such demonstrations, the agent is able to progressively expand the solved subspace in the entire task space. Experiment results show that SIR can significantly accelerate and improve learning on a variety of challenging sparse-reward continuous-control problems with compositional structures. 
Code and videos are available at https://sites.google.com/view/sir-compositional.", + "title": "Solving Compositional Reinforcement Learning Problems via Task Reduction", + "authors": [ + "Yunfei Li", + "Yilin Wu", + "Huazhe Xu", + "Xiaolong Wang", + "Yi Wu" + ], + "emails": [ + "~Yunfei_Li1", + "~Huazhe_Xu1", + "gmail.com", + "~Xiaolong_Wang3", + "~Yi_Wu1" + ], + "rank": 1516 + }, + { + "url": "https://openreview.net/forum?id=4I5THWNSjC", + "decision": "Reject", + "ratings": [ + 7, + 3, + 6 + ], + "rating": "5.29", + "confidences": [ + 5, + 5, + 4 + ], + "abstract": "We present BasisNet which combines recent advancements in efficient neural network architectures, conditional computation, and early termination in a simple new form. Our approach uses a lightweight model to preview an image and generate input-dependent combination coefficients, which are later used to control the synthesis of a specialist model for making more accurate final prediction. The two-stage model synthesis strategy can be used with any network architectures and both stages can be jointly trained end to end. We validated BasisNet on ImageNet classification with MobileNets as backbone, and demonstrated clear advantage on accuracy-efficiency trade-off over strong baselines such as EfficientNet (Tan & Le, 2019), FBNetV3 (Dai et al., 2020) and OFA (Cai et al., 2019). Specifically, BasisNet-MobileNetV3 obtained 80.3% top-1 accuracy with only 290M Multiply-Add operations (MAdds), halving the computational cost of previous state-of-the-art without sacrificing accuracy. Besides, since the first-stage lightweight model can independently make predictions, inference can be terminated early if the prediction is sufficiently confident. With early termination, the average cost can be further reduced to 198M MAdds while maintaining accuracy of 80.0%.", + "title": "BasisNet: Two-stage Model Synthesis for Efficient Inference", + "authors": [ + "Mingda Zhang", + "Andrey Zhmoginov", + "Andrew G. 
Howard", + "Brendan Jou", + "Yukun Zhu", + "Li Zhang", + "Rebecca Hwa", + "Adriana Kovashka" + ], + "emails": [ + "~Yukun_Zhu1", + "~Mingda_Zhang1", + "google.com", + "~Adriana_Kovashka1", + "~Andrey_Zhmoginov1", + "~Andrew_G._Howard1", + "~Rebecca_Hwa1", + "~Brendan_Jou1" + ], + "rank": 1517 + }, + { + "url": "https://openreview.net/forum?id=_TGlfdZOHY3", + "decision": "Reject", + "ratings": [ + 4, + 7, + 5, + 5 + ], + "rating": "5.29", + "confidences": [ + 5, + 5, + 3, + 4 + ], + "abstract": "Episodic learning is a popular practice among researchers and practitioners interested in few-shot learning. It consists of organising training in a series of learning problems, each relying on small \u201csupport\u201d and \u201cquery\u201d sets to mimic the few-shot circumstances encountered during evaluation.\nIn this paper, we investigate the usefulness of episodic learning in Prototypical Networks, one of the most popular algorithms making use of this practice.\nSurprisingly, in our experiments we found that, for Prototypical Networks, it is detrimental to use the episodic learning strategy of separating training samples between support and query set, as it is\na data-inefficient way to exploit training batches. 
This \u201cnon-episodic\u201d version of Prototypical Networks, which corresponds to the classic Neighbourhood Component Analysis, reliably improves over its episodic counterpart in multiple datasets, achieving an accuracy that is competitive with the state-of-the-art, despite being extremely simple.", + "title": "On Episodes, Prototypical Networks, and Few-Shot Learning", + "authors": [ + "Steinar Laenen", + "Luca Bertinetto" + ], + "emails": [ + "~Steinar_Laenen1", + "~Luca_Bertinetto1" + ], + "rank": 1518 + }, + { + "url": "https://openreview.net/forum?id=oOzqmUtudq", + "ratings": [ + 4, + 6, + 7, + 4 + ], + "rating": "5.28", + "confidences": [ + 4, + 4, + 5, + 5 + ], + "abstract": "Deep Metric Learning (DML) provides a crucial tool for visual similarity and zero-shot retrieval applications by learning generalizing embedding spaces, although recent work in DML has shown strong performance saturation across training objectives.\nHowever, generalization capacity is known to scale with the embedding space dimensionality. Unfortunately, high dimensional embeddings also create higher retrieval cost for downstream applications.\nTo remedy this, we propose S2SD - Simultaneous Similarity-based Self-distillation. S2SD extends DML with knowledge distillation from auxiliary, high-dimensional embedding and feature spaces to leverage complementary context during training while retaining test-time cost and with negligible changes to the training time. 
\nExperiments and ablations across different objectives and standard benchmarks show S2SD offering notable improvements of up to 7% in Recall@1, while also setting a new state-of-the-art.", + "title": "S2SD: Simultaneous Similarity-based Self-Distillation for Deep Metric Learning", + "authors": [ + "Karsten Roth", + "Timo Milbich", + "Bj\u00f6rn Ommer", + "Joseph Paul Cohen", + "Marzyeh Ghassemi" + ], + "emails": [ + "~Marzyeh_Ghassemi1", + "~Karsten_Roth1", + "~Timo_Milbich2", + "~Joseph_Paul_Cohen1", + "~Bj\u00f6rn_Ommer2" + ], + "rank": 1519 + }, + { + "url": "https://openreview.net/forum?id=I6NRcao1w-X", + "decision": "Reject", + "ratings": [ + 5, + 4, + 7, + 5 + ], + "rating": "5.27", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Reinforcement Learning (RL) is an effective tool for controller design but can struggle with issues of robustness, failing catastrophically when the underlying system dynamics are perturbed. The Robust RL formulation tackles this by adding worst-case adversarial noise to the dynamics and constructing the noise distribution as the solution to a zero-sum minimax game. However, existing work on learning solutions to the Robust RL formulation has primarily focused on training a single RL agent against a single adversary. In this work, we demonstrate that using a single adversary does not consistently yield robustness to dynamics variations under standard parametrizations of the adversary; the resulting policy is highly exploitable by new adversaries. We propose a population-based augmentation to the Robust RL formulation in which we randomly initialize a population of adversaries and sample from the population uniformly during training. We empirically validate across a variety of benchmarks that the use of an adversarial population results in a less exploitable, more robust policy. 
Finally, we demonstrate that this approach provides comparable robustness and generalization as domain randomization on these benchmarks while avoiding a ubiquitous domain randomization failure mode.", + "title": "Robust Reinforcement Learning using Adversarial Populations", + "authors": [ + "Eugene Vinitsky", + "Yuqing du", + "Kanaad V Parvate", + "Kathy Jang", + "Pieter Abbeel", + "Alexandre Bayen" + ], + "emails": [ + "~Kanaad_V_Parvate1", + "~Alexandre_Bayen2", + "~Eugene_Vinitsky1", + "~Pieter_Abbeel2", + "berkeley.edu", + "~Kathy_Jang1" + ], + "rank": 1520 + }, + { + "url": "https://openreview.net/forum?id=YzgAOeA67xX", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 5 + ], + "rating": "5.27", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Weight decay is a popular regularization technique for training of deep neural networks. Modern deep learning libraries mainly use L2 regularization as the default implementation of weight decay. \\citet{loshchilov2018decoupled} demonstrated that L2 regularization is not identical to weight decay for adaptive gradient methods, such as Adaptive Momentum Estimation (Adam), and proposed Adam with Decoupled Weight Decay (AdamW). However, we found that the popular implementations of weight decay, including L2 regularization and decoupled weight decay, in modern deep learning libraries usually damage performance. First, the L2 regularization is unstable weight decay for all optimizers that use Momentum, such as stochastic gradient descent (SGD). Second, decoupled weight decay is highly unstable for all adaptive gradient methods. We further propose the Stable Weight Decay (SWD) method to fix the unstable weight decay problem from a dynamical perspective. The proposed SWD method makes significant improvements over L2 regularization and decoupled weight decay in our experiments. 
Simply fixing weight decay in Adam by SWD, with no extra hyperparameter, can outperform complex Adam variants, which have more hyperparameters. ", + "title": "Stable Weight Decay Regularization", + "authors": [ + "Zeke Xie", + "Issei Sato", + "Masashi Sugiyama" + ], + "emails": [ + "~Zeke_Xie1", + "~Issei_Sato1", + "~Masashi_Sugiyama1" + ], + "rank": 1521 + }, + { + "url": "https://openreview.net/forum?id=aKt7FHPQxVV", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 6 + ], + "rating": "5.27", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Neural architecture search (NAS) automatically designs effective network architectures. Differentiable NAS with supernets that encompass all potential architectures in a large graph cuts down search overhead to few GPU days or less. However, these algorithms consume massive GPU memory, which will restrain NAS from large batch sizes and large search spaces (e.g., more candidate operations, diverse cell structures, and large depth of supernets). In this paper, we present binary neural architecture search (NASB) with consecutive model parallel (CMP) to tackle the problem of insufficient GPU memory. CMP aggregates memory from multiple GPUs for supernets. It divides forward/backward phases into several sub-tasks and executes the same type of sub-tasks together to reduce waiting cycles. This approach improves the hardware utilization of model parallel, but it utilizes large GPU memory. NASB is proposed to reduce memory footprint, which excludes inactive operations from computation graphs and computes those operations on the fly for inactive architectural gradients in backward phases. Experiments show that NASB-CMP runs 1.2\u00d7 faster than other model parallel approaches and outperforms state-of-the-art differentiable NAS. NASB can also save twice GPU memory more than PC-DARTS. Finally, we apply NASB-CMP to complicated supernet architectures. 
Although deep supernets with diverse cell structures do not improve NAS performance, NASB-CMP shows its potential to explore supernet architecture design in large search space.", + "title": "Efficient Differentiable Neural Architecture Search with Model Parallelism", + "authors": [ + "Yi-Wei Chen", + "Qingquan Song", + "Xia Hu" + ], + "emails": [ + "~Yi-Wei_Chen1", + "~Qingquan_Song1", + "~Xia_Hu4" + ], + "rank": 1522 + }, + { + "url": "https://openreview.net/forum?id=87Ti3dufEv", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 5 + ], + "rating": "5.27", + "confidences": [ + 3, + 3, + 2, + 3 + ], + "abstract": "Optimizing with group sparsity is significant in enhancing model interpretability in machine learning applications, e.g., feature selection, compressed sensing and model compression. However, for large-scale stochastic training problems, effective group-sparsity exploration is typically hard to achieve. Particularly, the state-of-the-art stochastic optimization algorithms usually generate merely dense solutions. To overcome this shortage, we propose a stochastic method\u2014Half-space Stochastic Projected Gradient method (HSPG) to search solutions of high group sparsity while maintaining the convergence. Initialized by a simple Prox-SG Step, the HSPG method relies on a novel Half-Space Step to substantially boost the sparsity level. 
Numerically, HSPG demonstrates its superiority in deep neural networks, e.g., VGG16, ResNet18 and MobileNetV1, by computing solutions of higher group sparsity, competitive objective values and generalization accuracy.", + "title": "A Half-Space Stochastic Projected Gradient Method for Group Sparsity Regularization", + "authors": [ + "Tianyi Chen", + "Guanyi Wang", + "Tianyu DING", + "Bo Ji", + "Sheng Yi", + "Zhihui Zhu" + ], + "emails": [ + "~Guanyi_Wang1", + "~Zhihui_Zhu1", + "microsoft.com", + "~Tianyu_DING2", + "~Bo_Ji2" + ], + "rank": 1523 + }, + { + "url": "https://openreview.net/forum?id=nCY83KxoehA", + "decision": "Reject", + "ratings": [ + 6, + 6, + 4, + 5 + ], + "rating": "5.27", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Pretrained contextualized embeddings are powerful word representations for structured prediction tasks. Recent work found that better word representations can be obtained by concatenating different types of embeddings. However, the selection of embeddings to form the best concatenated representation usually varies depending on the task and the collection of candidate embeddings, and the ever-increasing number of embedding types makes it a more difficult problem. In this paper, we propose Automated Concatenation of Embeddings (ACE) to automate the process of finding better concatenations of embeddings for structured prediction tasks, based on a formulation inspired by recent progress on neural architecture search. Specifically, a controller alternately samples a concatenation of embeddings, according to its current belief of the effectiveness of individual embedding types in consideration for a task, and updates the belief based on a reward. We follow strategies in reinforcement learning to optimize the parameters of the controller and compute the reward based on the accuracy of a task model, which is fed with the sampled concatenation as input and trained on a task dataset. 
Empirical results on 6 tasks and 21 datasets show that our approach outperforms strong baselines and achieves state-of-the-art performance with fine-tuned embeddings in the vast majority of evaluations.", + "title": "Automated Concatenation of Embeddings for Structured Prediction", + "authors": [ + "Xinyu Wang", + "Yong Jiang", + "Nguyen Bach", + "Tao Wang", + "Zhongqiang Huang", + "Fei Huang", + "Kewei Tu" + ], + "emails": [ + "~Zhongqiang_Huang1", + "~Kewei_Tu1", + "~Tao_Wang4", + "~Yong_Jiang1", + "~Nguyen_Bach1", + "~Fei_Huang2", + "~Xinyu_Wang3" + ], + "rank": 1524 + }, + { + "url": "https://openreview.net/forum?id=ptbb7olhGHd", + "decision": "Reject", + "ratings": [ + 4, + 5, + 7, + 5 + ], + "rating": "5.27", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Machine learning (ML) models are known to be vulnerable to attacks both at training and test time. Despite the extensive literature on adversarial ML, prior efforts focus primarily on applications of computer vision to object recognition or sentiment analysis to movie reviews. In these settings, the incentives for adversaries to manipulate the model's prediction are often unclear and attacks require extensive control of direct inputs to the model. This makes it difficult to evaluate how severe the impact of vulnerabilities exposed is on systems deploying ML with little provenance guarantees for the input data. In this paper, we study adversarial ML with stock price forecasting. Adversarial incentives are clear and may be quantified experimentally through a simulated portfolio. We replicate an industry standard pipeline, which performs a sentiment analysis of Twitter data to forecast trends in stock prices. We show that an adversary can exploit the lack of provenance to indirectly use tweets to manipulate the model's perceived sentiment about a target company and in turn force the model to forecast price erroneously. Our attack is mounted at test time and does not modify the training data. 
Given past market anomalies, we conclude with a series of recommendations for the use of machine learning as input signal to trading algorithms. ", + "title": "On the Robustness of Sentiment Analysis for Stock Price Forecasting", + "authors": [ + "Gabriel Deza", + "Colin Rowat", + "Nicolas Papernot" + ], + "emails": [ + "bham.ac.uk", + "~Gabriel_Deza1", + "~Nicolas_Papernot1" + ], + "rank": 1525 + }, + { + "url": "https://openreview.net/forum?id=uV7hcsjqM-", + "decision": "Reject", + "ratings": [ + 4, + 6, + 6 + ], + "rating": "5.27", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "Machine-aided programming tools such as automated type predictors and autocomplete are increasingly learning-based. However, current approaches predominantly rely on supervised learning with task-specific datasets. We propose Contrastive Code Representation Learning (ContraCode), a self-supervised algorithm for learning task-agnostic semantic representations of programs via contrastive learning. Our approach uses no human-provided labels, only the raw text of programs. ContraCode optimizes for a representation that is invariant to semantic-preserving code transformations. We develop an automated source-to-source compiler that generates textually divergent variants of source programs. We then train a neural network to identify variants of anchor programs within a large batch of non-equivalent negatives. To solve this task, the network must extract features representing the functionality, not form, of the program. In experiments, we pre-train ContraCode with 1.8M unannotated JavaScript methods mined from GitHub, then transfer to downstream tasks by fine-tuning. Pre-training with ContraCode consistently improves the F1 score of code summarization baselines and top-1 accuracy of type inference baselines by 2% to 13%. ContraCode achieves 9% higher top-1 accuracy than the current state-of-the-art static type analyzer for TypeScript. 
Finally, representations learned through a hybrid contrastive and reconstruction objective transfer in zero-shot to code clone detection with +10% AUROC over a static text similarity measure and +5% over reconstruction alone.", + "title": "Contrastive Code Representation Learning", + "authors": [ + "Paras Jain", + "Ajay Jain", + "Tianjun Zhang", + "Pieter Abbeel", + "Joseph E. Gonzalez", + "Ion Stoica" + ], + "emails": [ + "~Paras_Jain1", + "~Ajay_Jain1", + "~Joseph_E._Gonzalez1", + "~Ion_Stoica1", + "~Pieter_Abbeel2", + "~Tianjun_Zhang1" + ], + "rank": 1526 + }, + { + "url": "https://openreview.net/forum?id=qFQTP00Q0kp", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5 + ], + "rating": "5.27", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "Self-supervised learning achieves superior performance in many domains by extracting useful representations from the unlabeled data. However, most of traditional self-supervised methods mainly focus on exploring the inter-sample structure while less efforts have been concentrated on the underlying intra-temporal structure, which is important for time series data. In this paper, we present SelfTime: a general self-supervised time series representation learning framework, by exploring the inter-sample relation and intra-temporal relation of time series to learn the underlying structure feature on the unlabeled time series. Specifically, we first generate the inter-sample relation by sampling positive and negative samples of a given anchor sample, and intra-temporal relation by sampling time pieces from this anchor. Then, based on the sampled relation, a shared feature extraction backbone combined with two separate relation reasoning heads are employed to quantify the relationships of the sample pairs for inter-sample relation reasoning, and the relationships of the time piece pairs for intra-temporal relation reasoning, respectively. 
Finally, the useful representations of time series are extracted from the backbone under the supervision of relation reasoning heads. Experimental results on multiple real-world time series datasets for time series classification task demonstrate the effectiveness of the proposed method. Code and data are publicly available.", + "title": "Self-Supervised Time Series Representation Learning by Inter-Intra Relational Reasoning", + "authors": [ + "Haoyi Fan", + "Fengbin Zhang", + "Yue Gao" + ], + "emails": [ + "hrbust.edu.cn", + "~Haoyi_Fan1", + "~Yue_Gao4" + ], + "rank": 1527 + }, + { + "url": "https://openreview.net/forum?id=1hkYtDXAgOZ", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 5 + ], + "rating": "5.27", + "confidences": [ + 5, + 3, + 4, + 3 + ], + "abstract": "The task of temporal action proposal generation (TAPG) aims to provide high-quality video segments, i.e., proposals that potentially contain action events. The performance of tackling the TAPG task heavily depends on two key issues, feature representation and scoring mechanism. To simultaneously take account of both aspects, we introduce an attention-based model, termed as FITS, to address the issues for retrieving high-quality proposals. We first propose a novel Feature-Integration (FI) module to seamlessly fuse two-stream features concerning their interaction to yield a robust video segment representation. We then design a group of Transformer-driven Scorers (TS) to gain the temporal contextual supports over the representations for estimating the starting or ending boundary of an action event. Unlike most previous work to estimate action boundaries without considering the long-range temporal neighborhood, the proposed action-boundary co-estimation mechanism in TS leverages the bi-directional contextual supports for such boundary estimation, which shows the advantage of removing several false-positive boundary predictions. 
We conduct experiments on two challenging datasets, ActivityNet-1.3 and THUMOS-14. The experimental results demonstrate that the proposed FITS model consistently outperforms state-of-the-art TAPG methods.", + "title": "Feature Integration and Group Transformers for Action Proposal Generation", + "authors": [ + "He-Yen Hsieh", + "Ding-Jie Chen", + "Tung-Ying Lee", + "Tyng-Luh Liu" + ], + "emails": [ + "berry-ai.com", + "~Ding-Jie_Chen1", + "~Tyng-Luh_Liu1", + "~He-Yen_Hsieh1" + ], + "rank": 1528 + }, + { + "url": "https://openreview.net/forum?id=Kzg0XmE6mxu", + "decision": "Reject", + "ratings": [ + 4, + 5, + 6, + 6 + ], + "rating": "5.27", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Learning a distance metric between pairs of examples is widely important for various tasks. Deep Metric Learning (DML) utilizes deep neural network architectures to learn semantic feature embeddings where the distance between similar examples is close and dissimilar examples are far. While the underlying neural networks produce good accuracy on naturally occurring samples, they are vulnerable to adversarially-perturbed samples that can reduce their accuracy. To create robust versions of DML models, we introduce a robust training approach. A key challenge is that metric losses are not independent --- they depend on all samples in a mini-batch. This sensitivity to samples, if not accounted for, can lead to incorrect robust training. To the best of our knowledge, we are the first to systematically analyze this dependence effect and propose a principled approach for robust training of deep metric learning networks that accounts for the nuances of metric losses. Using experiments on three popular datasets in metric learning, we demonstrate the DML models trained using our techniques display robustness against strong iterative attacks while their performance on unperturbed (natural) samples remains largely unaffected. 
", + "title": "Adversarial Deep Metric Learning", + "authors": [ + "Thomas Kobber Panum", + "Zi Wang", + "Pengyu Kan", + "Earlence Fernandes", + "Somesh Jha" + ], + "emails": [ + "~Somesh_Jha1", + "cs.wisc.edu", + "~Zi_Wang3", + "~Thomas_Kobber_Panum1" + ], + "rank": 1529 + }, + { + "url": "https://openreview.net/forum?id=8qsqXlyn-Lp", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 5 + ], + "rating": "5.27", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Low-dimensional embedding techniques such as tSNE and UMAP allow visualizing high-dimensional data and therewith facilitate the discovery of interesting structure. Although they are widely used, they visualize data as is, rather than in light of the background knowledge we have about the data. What we already know, however, strongly determines what is novel and hence interesting. In this paper we propose two methods for factoring out prior knowledge in the form of distance matrices from low-dimensional embeddings. To factor out prior knowledge from tSNE embeddings, we propose JEDI that adapts the tSNE objective in a principled way using Jensen-Shannon divergence. To factor out prior knowledge from any downstream embedding approach, we propose CONFETTI , in which we directly operate on the input distance matrices. 
Extensive experiments on both synthetic and real world data show that both methods work well, providing embeddings that exhibit meaningful structure that would otherwise remain hidden.", + "title": "Factoring out Prior Knowledge from Low-Dimensional Embeddings", + "authors": [ + "Edith Heiter", + "Jonas Fischer", + "Jilles Vreeken" + ], + "emails": [ + "mmci.uni-saarland.de", + "~Jonas_Fischer1", + "cispa.de" + ], + "rank": 1530 + }, + { + "url": "https://openreview.net/forum?id=2bw8QFtPAZD", + "ratings": [ + 5, + 6, + 6, + 4 + ], + "rating": "5.27", + "confidences": [ + 5, + 3, + 4, + 3 + ], + "abstract": "Point cloud is an important 3D data representation widely used in many essential applications. Leveraging deep neural networks, recent works have shown great success in processing 3D point clouds. However, those deep neural networks are vulnerable to various 3D adversarial attacks, which can be summarized as two primary types: point perturbation that affects local point distribution, and surface distortion that causes dramatic changes in geometry. In this paper, we propose a novel 3D adversarial point cloud defense method leveraging implicit function based restoration (IF-Defense) to address both the aforementioned attacks. It is composed of two steps: 1) it predicts an implicit function that captures the clean shape through a surface recovery module, and 2) restores a clean and complete point cloud via minimizing the difference between the attacked point cloud and the predicted implicit function under geometry- and distribution- aware constraints. Our experimental results show that IF-Defense achieves the state-of-the-art defense performance against all existing adversarial attacks on PointNet, PointNet++, DGCNN and PointConv. 
Comparing with previous methods, IF-Defense presents 20.02% improvement in classification accuracy against salient point dropping attack and 16.29% against LG-GAN attack on PointNet.", + "title": "IF-Defense: 3D Adversarial Point Cloud Defense via Implicit Function based Restoration", + "authors": [ + "Ziyi Wu", + "Yueqi Duan", + "He Wang", + "Qingnan Fan", + "Leonidas Guibas" + ], + "emails": [ + "~Leonidas_Guibas1", + "~Ziyi_Wu1", + "~He_Wang5", + "~Yueqi_Duan1", + "~Qingnan_Fan2" + ], + "rank": 1531 + }, + { + "url": "https://openreview.net/forum?id=hzkhOUll63", + "decision": "Reject", + "ratings": [ + 6, + 6, + 4 + ], + "rating": "5.27", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "We prove new generalization bounds for stochastic gradient descent for both the convex and non-convex case. Our analysis is based on the stability framework. We analyze stability with respect to the normalized version of the loss function used for training. This leads to investigating a form of angle-wise stability instead of euclidean stability in weights. For neural networks, the measure of distance we consider is invariant to rescaling the weights of each layer. Furthermore, we exploit the notion of on-average stability in order to obtain a data-dependent quantity in the bound. 
This data dependent quantity is seen to be more favorable when training with larger learning rates in our numerical experiments.This might help to shed some light on why larger learning rates can lead to better generalization in some practical scenarios.", + "title": "Stability analysis of SGD through the normalized loss function", + "authors": [ + "Alexandre Lemire Paquin", + "Brahim Chaib-draa", + "Philippe Gigu\u00e8re" + ], + "emails": [ + "~Brahim_Chaib-draa1", + "~Alexandre_Lemire_Paquin1", + "~Philippe_Gigu\u00e8re1" + ], + "rank": 1532 + }, + { + "url": "https://openreview.net/forum?id=sHSzfA4J7p", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6 + ], + "rating": "5.27", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "Recent progress in image recognition has stimulated the deployment of vision systems at an unprecedented scale. As a result, visual data are now often consumed not only by humans but also by machines. Existing image processing methods only optimize for better human perception, yet the resulting images may not be accurately recognized by machines. This can be undesirable, e.g., the images can be improperly handled by search engines or recommendation systems. In this work, we propose simple approaches to improve machine interpretability of processed images: optimizing the recognition loss directly on the image processing network or through an intermediate transforming model. Interestingly, the processing model's ability to enhance recognition quality can transfer when evaluated on models of different architectures, recognized categories, tasks and training datasets. This makes the solutions applicable even when we do not have the knowledge of future recognition models, e.g., if we upload processed images to the Internet. We conduct experiments on multiple image processing tasks, with ImageNet classification and PASCAL VOC detection as recognition tasks. 
With our simple methods, substantial accuracy gain can be achieved with strong transferability and minimal image quality loss. Through a user study we further show that the accuracy gain can transfer to a black-box, third-party cloud model. Finally, we try to explain this transferability phenomenon by demonstrating the similarities of different models' decision boundaries. Code is available at https://github.com/anonymous20202020/Transferable_RA .", + "title": "Transferable Recognition-Aware Image Processing", + "authors": [ + "Zhuang Liu", + "Tinghui Zhou", + "Hung-Ju Wang", + "Zhiqiang Shen", + "Bingyi Kang", + "Evan Shelhamer", + "Trevor Darrell" + ], + "emails": [ + "~Bingyi_Kang1", + "~Tinghui_Zhou1", + "~Hung-Ju_Wang1", + "~Zhiqiang_Shen1", + "~Zhuang_Liu1", + "~Evan_Shelhamer2", + "~Trevor_Darrell2" + ], + "rank": 1533 + }, + { + "url": "https://openreview.net/forum?id=6s480DdlRQQ", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5 + ], + "rating": "5.27", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "Current Deep Neural Network (DNN) backdooring attacks rely on adding static triggers (with fixed patterns and locations) on model inputs that are prone to detection. In this paper, we propose the first class of dynamic backdooring techniques: Random Backdoor, Backdoor Generating Network (BaN), and conditional Backdoor Generating Network (c-BaN). Triggers generated by our techniques have random patterns and locations. In particular, BaN and c-BaN based on a novel generative network are the first two schemes that algorithmically generate triggers. Moreover, c-BaN is the first conditional backdooring technique that given a target label, it can generate a target-specific trigger. Both BaN and c-BaN are essentially a general framework which renders the adversary the flexibility for further customizing backdoor attacks. 
We extensively evaluate our techniques on three benchmark datasets and show that our techniques achieve almost perfect attack performance on backdoored data with a negligible utility loss. More importantly, our techniques can bypass state-of-the-art defense mechanisms.", + "title": "Dynamic Backdoor Attacks Against Deep Neural Networks", + "authors": [ + "Ahmed Salem", + "Rui Wen", + "Michael Backes", + "Shiqing Ma", + "Yang Zhang" + ], + "emails": [ + "~Yang_Zhang15", + "~Ahmed_Salem2", + "~Michael_Backes1", + "cispa.saarland", + "rutgers.edu" + ], + "rank": 1534 + }, + { + "url": "https://openreview.net/forum?id=yNFwsrcEtO0", + "ratings": [ + 5, + 6, + 5, + 5 + ], + "rating": "5.27", + "confidences": [ + 5, + 4, + 4, + 2 + ], + "abstract": "Momentum is a popular technique in deep learning for gradient-based optimizers. We propose a decaying momentum (Demon) rule, motivated by decaying the total contribution of a gradient to all future updates. Applying Demon to Adam leads to significantly improved training, notably competitive to momentum SGD with learning rate decay, even in settings in which adaptive methods are typically non-competitive. Similarly, applying Demon to momentum SGD improves over momentum SGD with learning rate decay in most cases. Notably, Demon momentum SGD is observed to be significantly less sensitive to parameter tuning than momentum SGD with learning rate decay schedule, critical to training neural networks in practice. Results are demonstrated across a variety of settings and architectures, including image classification, generative models, and language models. Demon is easy to implement and tune, and incurs limited extra computational overhead, compared to the vanilla counterparts. 
Code is readily available.", + "title": "Demon: Momentum Decay for Improved Neural Network Training", + "authors": [ + "John Chen", + "Cameron Wolfe", + "Zhao Li", + "Anastasios Kyrillidis" + ], + "emails": [ + "uth.tmc.edu", + "~John_Chen3", + "rice.edu", + "~Anastasios_Kyrillidis2" + ], + "rank": 1535 + }, + { + "url": "https://openreview.net/forum?id=n4IMHNb8_f", + "decision": "Reject", + "ratings": [ + 5, + 4, + 7, + 6 + ], + "rating": "5.27", + "confidences": [ + 4, + 5, + 3, + 3 + ], + "abstract": "We consider the problem of spatial path planning. In contrast to the classical solutions which optimize a new plan from scratch and assume access to the full map with ground truth obstacle locations, we learn a planner from the data in a differentiable manner that allows us to leverage statistical regularities from past data. We propose Spatial Planning Transformers (SPT), which given an obstacle map learns to generate actions by planning over long-range spatial dependencies, unlike prior data-driven planners that propagate information locally via convolutional structure in an iterative manner. In the setting where the ground truth map is not known to the agent, we leverage pre-trained SPTs to in an end-to-end framework that has the structure of mapper and planner built into it which allows seamless generalization to out-of-distribution maps and goals. 
SPTs outperform prior state-of-the-art across all the setups for both manipulation and navigation tasks, leading to an absolute improvement of 7-19%.", + "title": "Differentiable Spatial Planning using Transformers", + "authors": [ + "Devendra Singh Chaplot", + "Deepak Pathak", + "Jitendra Malik" + ], + "emails": [ + "~Devendra_Singh_Chaplot2", + "~Jitendra_Malik2", + "~Deepak_Pathak1" + ], + "rank": 1536 + }, + { + "url": "https://openreview.net/forum?id=butEPeLARP_", + "decision": "Reject", + "ratings": [ + 4, + 5, + 7, + 5 + ], + "rating": "5.27", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": " Real-world machine learning systems are often are trained using a mix of data sources with varying cost and quality. Understanding how the size and composition of a training dataset affect model performance is critical for advancing our understanding of generalization, as well as designing more effective data collection policies. We show that there is a simple, accurate way to predict the loss incurred by a model based on data size and composition. Our work expands recent observations of log-linear generalization error and uses this to cast model performance prediction as a learning problem. Using the theory of optimal experimental design, we derive a simple rational function approximation to generalization error that can be fitted using a few model training runs. 
Our approach achieves nearly exact (r2>.93) predictions of model performance under substantial extrapolation in two different standard supervised learning tasks and is accurate (r2>.83) on more challenging machine translation and question answering tasks where baselines achieve worse-than-random performance.", + "title": "Predicting the impact of dataset composition on model performance", + "authors": [ + "Tatsunori Hashimoto" + ], + "emails": [ + "~Tatsunori_Hashimoto1" + ], + "rank": 1537 + }, + { + "url": "https://openreview.net/forum?id=cFpWC6ZMtmj", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5 + ], + "rating": "5.27", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "As the decisions made or influenced by machine learning models increasingly impact our lives, it is crucial to detect, understand, and mitigate unfairness. But even simply determining what ``unfairness'' should mean in a given context is non-trivial: there are many competing definitions, and choosing between them often requires a deep understanding of the underlying task. It is thus tempting to use model explainability to gain insights into model fairness, however existing explainability tools do not reliably indicate whether a model is indeed fair. In this work we present a new approach to explaining fairness in machine learning, based on the Shapley value paradigm. Our fairness explanations attribute a model's overall unfairness to individual input features, even in cases where the model does not operate on sensitive attributes directly. Moreover, motivated by the linearity of Shapley explainability, we propose a meta algorithm for applying existing training-time fairness interventions, wherein one trains a perturbation to the original model, rather than a new model entirely. By explaining the original model, the perturbation, and the fair-corrected model, we gain insight into the accuracy-fairness trade-off that is being made by the intervention. 
We further show that this meta algorithm enjoys both flexibility and stability benefits with no loss in performance.", + "title": "Explainability for fair machine learning", + "authors": [ + "Tom Begley", + "Tobias Schwedes", + "Christopher Frye", + "Ilya Feige" + ], + "emails": [ + "~Christopher_Frye1", + "~Ilya_Feige1", + "~Tobias_Schwedes1", + "~Tom_Begley1" + ], + "rank": 1538 + }, + { + "url": "https://openreview.net/forum?id=Sx-mvOvnmJj", + "ratings": [ + 6, + 4, + 6 + ], + "rating": "5.27", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "Deep neural networks often perform poorly when training datasets are heavily class-imbalanced. Recently, two-stage methods greatly improve the performances by decoupling representation learning and classifier learning. In this paper, we discover that networks trained on long-tailed datasets are more prone to miscalibrated and over-confident. The two-stage models suffer the same issue as well. We design two novel methods to improve calibration and performance in such scenarios. Motivated by the predicted probability distributions of classes are highly related to the numbers of class instances, we propose a label-aware smoothing to deal with the different degrees of over-confidence for different classes and improve classifier learning. Noting that there is a dataset bias between these two stages because of different samplers, we further propose a shifted batch normalization to solve the dataset bias in the decoupling framework. Through extensive experiments, we also observe that mixup can remedy over-confidence and improve representation learning but has a negative or negligible effect on classifier learning. 
Our proposed methods set new records on multiple popular long-tailed recognition benchmarks including LT CIFAR 10/100, ImageNet-LT, Places-LT, and iNaturalist 2018.", + "title": "Improving Calibration for Long-Tailed Recognition", + "authors": [ + "Zhisheng Zhong", + "Jiequan Cui", + "Shu Liu", + "Jiaya Jia" + ], + "emails": [ + "~Jiaya_Jia1", + "~Zhisheng_Zhong1", + "~Jiequan_Cui1", + "~Shu_Liu4" + ], + "rank": 1539 + }, + { + "url": "https://openreview.net/forum?id=hbzCPZEIUU", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 5 + ], + "rating": "5.27", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "This paper considers classification problems with hierarchically organized classes. We force the classifier (hyperplane) of each class to belong to a sphere manifold, whose center is the classifier of its super-class. Then, individual sphere manifolds are connected based on their hierarchical relations. Our technique replaces the last layer of a neural network by combining a spherical fully-connected layer with a hierarchical layer. This regularization is shown to improve the performance of widely used deep neural network architectures (ResNet and DenseNet) on publicly available datasets (CIFAR100, CUB200, Stanford dogs, Stanford cars, and Tiny-ImageNet).", + "title": "Connecting Sphere Manifolds Hierarchically for Regularization", + "authors": [ + "Damien Scieur", + "Youngsung Kim" + ], + "emails": [ + "~Youngsung_Kim3", + "~Damien_Scieur3" + ], + "rank": 1540 + }, + { + "url": "https://openreview.net/forum?id=V8YXffoDUSa", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 6 + ], + "rating": "5.27", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Recent work has suggested that feedforward residual neural networks (ResNets) approximate iterative recurrent computations. Iterative computations are useful in many domains, so they might provide good solutions for neural networks to learn. 
Here we quantify the degree to which ResNets learn iterative solutions and introduce a regularization approach that encourages learning of iterative solutions. Iterative methods are characterized by two properties: iteration and convergence. To quantify these properties, we define three indices of iterative convergence. Consistent with previous work, we show that, even though ResNets can express iterative solutions, they do not learn them when trained conventionally on computer vision tasks. We then introduce regularizations to encourage iterative convergent computation and test whether this provides a useful inductive bias. To make the networks more iterative, we manipulate the degree of weight sharing across layers using soft gradient coupling. This new method provides a form of recurrence regularization and can interpolate smoothly between an ordinary ResNet and a ``recurrent\" ResNet (i.e., one that uses identical weights across layers and thus could be physically implemented with a recurrent network computing the successive stages iteratively across time). To make the networks more convergent we impose a Lipschitz constraint on the residual functions using spectral normalization. The three indices of iterative convergence reveal that the gradient coupling and the Lipschitz constraint succeed at making the networks iterative and convergent, respectively. However, neither recurrence regularization nor spectral normalization improve classification accuracy on standard visual recognition tasks (MNIST, CIFAR-10, CIFAR-100) or on challenging recognition tasks with partial occlusions (Digitclutter). 
Iterative convergent computation, in these tasks, does not provide a useful inductive bias for ResNets.", + "title": "Iterative convergent computation is not a useful inductive bias for ResNets", + "authors": [ + "Samuel Lippl", + "Benjamin Peters", + "Nikolaus Kriegeskorte" + ], + "emails": [ + "~Nikolaus_Kriegeskorte3", + "columbia.edu", + "~Samuel_Lippl1" + ], + "rank": 1541 + }, + { + "url": "https://openreview.net/forum?id=jk1094_ZiN", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 4 + ], + "rating": "5.27", + "confidences": [ + 3, + 1, + 3, + 4 + ], + "abstract": "Neural Architecture Search (NAS) explores a large space of architectural motifs -- \na compute-intensive process that often involves ground-truth evaluation of each motif by instantiating it within a large network, and training and evaluating the network with thousands or more data samples. Inspired by how biological motifs such as cells are sometimes extracted from their natural environment and studied in an artificial Petri dish setting, this paper proposes the Synthetic Petri Dish model for evaluating architectural motifs. In the Synthetic Petri Dish, architectural motifs are instantiated in very small networks and evaluated using very few learned synthetic data samples (to effectively approximate performance in the full problem). The relative performance of motifs in the Synthetic Petri Dish can substitute for their ground-truth performance, thus accelerating the most expensive step of NAS. Unlike other neural network-based prediction models that parse the structure of the motif to estimate its performance, the Synthetic Petri Dish predicts motif performance by training the actual motif in an artificial setting, thus deriving predictions from its true intrinsic properties. 
Experiments in this paper demonstrate that the Synthetic Petri Dish can therefore predict the performance of new motifs with significantly higher accuracy, especially when insufficient ground truth data is available.\nOur hope is that this work can inspire a new research direction in studying the performance of extracted components of models in a synthetic diagnostic setting optimized to provide informative evaluations.", + "title": "Synthetic Petri Dish: A Novel Surrogate Model for Rapid Architecture Search", + "authors": [ + "Aditya Rawal", + "Joel Lehman", + "Felipe Petroski Such", + "Jeff Clune", + "Kenneth Stanley" + ], + "emails": [ + "~Felipe_Petroski_Such1", + "~Kenneth_Stanley1", + "~Jeff_Clune1", + "~Aditya_Rawal1", + "~Joel_Lehman1" + ], + "rank": 1542 + }, + { + "url": "https://openreview.net/forum?id=WlT94P_zuHF", + "decision": "Reject", + "ratings": [ + 7, + 4, + 5, + 5 + ], + "rating": "5.27", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Transformer networks have shown outstanding performance on many natural language processing tasks. However the context length (the number of previous tokens on which the output states depend) of a Transformer network grows at best linearly with the memory and computational power used. This limitation prevents a transformer network to have very long context in a resource limited application. In this work, we propose a class of transformer networks, namely Transformer-QL (\\bf{Q}uadratically \\bf{L}arge), in which, the context length can grow at best quadratically with the memory and computational power used. We have empirically evaluated a Transformer-QL model in three long range language modeling datasets. 
The results show that Transformer-QL can provide significant improvements over other state of the art networks.", + "title": "Transformer-QL: A Step Towards Making Transformer Network Quadratically Large", + "authors": [ + "Suvadeep Hajra" + ], + "emails": [ + "~Suvadeep_Hajra1" + ], + "rank": 1543 + }, + { + "url": "https://openreview.net/forum?id=QjINdYOfq0b", + "decision": "Reject", + "ratings": [ + 6, + 4, + 6 + ], + "rating": "5.27", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "We present Automatic Bit Sharing (ABS) to automatically search for optimal model compression configurations (e.g., pruning ratio and bitwidth). Unlike previous works that consider model pruning and quantization separately, we seek to optimize them jointly. To deal with the resultant large designing space, we propose a novel super-bit model, a single-path method, to encode all candidate compression configurations, rather than maintaining separate paths for each configuration. Specifically, we first propose a novel decomposition of quantization that encapsulates all the candidate bitwidths in the search space. Starting from a low bitwidth, we sequentially consider higher bitwidths by recursively adding re-assignment offsets. We then introduce learnable binary gates to encode the choice of bitwidth, including filter-wise 0-bit for pruning. By jointly training the binary gates in conjunction with network parameters, the compression configurations of each layer can be automatically determined. Our ABS brings two benefits for model compression: 1) It avoids the combinatorially large design space, with a reduced number of trainable parameters and search costs. 2) It also averts directly fitting an extremely low bit quantizer to the data, hence greatly reducing the optimization difficulty due to the non-differentiable quantization. Experiments on CIFAR-100 and ImageNet show that our methods achieve significant computational cost reduction while preserving promising performance. 
", + "title": "ABS: Automatic Bit Sharing for Model Compression", + "authors": [ + "Jing Liu", + "Bohan Zhuang", + "Peng Chen", + "Yong Guo", + "Chunhua Shen", + "Jianfei Cai", + "Mingkui Tan" + ], + "emails": [ + "~Jing_Liu8", + "~Chunhua_Shen1", + "~Peng_Chen2", + "~Jianfei_Cai1", + "~Yong_Guo1", + "~Mingkui_Tan2", + "~Bohan_Zhuang1" + ], + "rank": 1544 + }, + { + "url": "https://openreview.net/forum?id=Uh0T_Q0pg7r", + "decision": "Reject", + "ratings": [ + 6, + 6, + 4 + ], + "rating": "5.27", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "Deep learning models such as Convolutional Neural Networks (CNNs) have demonstrated high levels of effectiveness in a variety of domains, including computer vision and more recently, computational biology. However, training effective models often requires assembling and/or labeling large datasets, which may be prohibitively time-consuming or costly. Pool-based active learning techniques have the potential to mitigate these issues, leveraging models trained on limited data to selectively query unlabeled data points from a pool in an attempt to expedite the learning process. Here we present \"Dropout-based Expected IMprOvementS\" (DEIMOS), a flexible and computationally-efficient approach to active learning that queries points that are expected to maximize the model's improvement across a representative sample of points. The proposed framework enables us to maintain a prediction covariance matrix capturing model uncertainty, and to dynamically update this matrix in order to generate diverse batches of points in the batch-mode setting. Our active learning results demonstrate that DEIMOS outperforms several existing baselines across multiple regression and classification tasks taken from computer vision and genomics.", + "title": "Active Learning in CNNs via Expected Improvement Maximization", + "authors": [ + "Udai G. Nagpal", + "David A. 
Knowles" + ], + "emails": [ + "~Udai_G._Nagpal1", + "~David_A._Knowles1" + ], + "rank": 1545 + }, + { + "url": "https://openreview.net/forum?id=JU8ceIgm5xB", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5 + ], + "rating": "5.27", + "confidences": [ + 3, + 3, + 5 + ], + "abstract": "Many self-supervised representation learning methods maximize mutual information (MI) across views. In this paper, we transform each view into a set of subviews and then decompose the original MI bound into a sum of bounds involving conditional MI between the subviews. E.g.,~given two views x and y of the same input example, we can split x into two subviews, x\u2032 and x\u2032\u2032, which depend only on x but are otherwise unconstrained. The following holds: I(x;y)\u2265I(x\u2032\u2032;y)+I(x\u2032;y|x\u2032\u2032), due to the chain rule and information processing inequality. By maximizing both terms in the decomposition, our approach explicitly rewards the encoder for any information about y which it extracts from x\u2032\u2032, and for information about y extracted from x\u2032 in excess of the information from x\u2032\u2032. We provide a novel contrastive lower-bound on conditional MI, that relies on sampling contrast sets from p(y|x\u2032\u2032). By decomposing the original MI into a sum of increasingly challenging MI bounds between sets of increasingly informed views, our representations can capture more of the total information shared between the original views. 
We empirically test the method in a vision domain and for dialogue generation.", + "title": "Decomposing Mutual Information for Representation Learning", + "authors": [ + "Alessandro Sordoni", + "Nouha Dziri", + "Hannes Schulz", + "Geoff Gordon", + "Remi Tachet des Combes", + "Philip Bachman" + ], + "emails": [ + "~Alessandro_Sordoni2", + "~Philip_Bachman1", + "~Hannes_Schulz1", + "ualberta.ca", + "~Geoff_Gordon2", + "~Remi_Tachet_des_Combes1" + ], + "rank": 1546 + }, + { + "url": "https://openreview.net/forum?id=LpSGtq6F5xN", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 6 + ], + "rating": "5.27", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "In this study, we propose a deep clustering algorithm that utilizes a variational autoencoder (VAE) framework with a multi encoder-decoder neural architecture. This setup enforces a complementary structure that guides the learned latent representations towards a more meaningful space arrangement. It differs from previous VAE-based clustering algorithms by employing a new generative model that uses multiple encoder-decoders.\nWe show that this modeling results in both better clustering capabilities and improved data generation. The proposed method is evaluated on standard datasets and is shown to outperform state-of-the-art deep clustering methods significantly.", + "title": "A Mixture of Variational Autoencoders for Deep Clustering", + "authors": [ + "Avi Caciularu", + "Jacob Goldberger" + ], + "emails": [ + "~Avi_Caciularu1", + "~Jacob_Goldberger1" + ], + "rank": 1547 + }, + { + "url": "https://openreview.net/forum?id=MpStQoD73Mj", + "decision": "Reject", + "ratings": [ + 6, + 5, + 4, + 6 + ], + "rating": "5.26", + "confidences": [ + 5, + 4, + 5, + 5 + ], + "abstract": "We introduce a framework for automatic differentiation with weighted finite-state transducers (WFSTs) allowing them to be used dynamically at training time. 
Through the separation of graphs from operations on graphs, this framework enables the exploration of new structured loss functions which in turn eases the encoding of prior knowledge into learning algorithms. We show how the framework can combine pruning and back-off in transition models with various sequence-level loss functions. We also show how to learn over the latent decomposition of phrases into word pieces. Finally, to demonstrate that WFSTs can be used in the interior of a deep neural network, we propose a convolutional WFST layer which maps lower-level representations to higher-level representations and can be used as a drop-in replacement for a traditional convolution. We validate these algorithms with experiments in handwriting recognition and speech recognition.", + "title": "Differentiable Weighted Finite-State Transducers", + "authors": [ + "Awni Hannun", + "Vineel Pratap", + "Jacob Kahn", + "Wei-Ning Hsu" + ], + "emails": [ + "fb.com", + "~Awni_Hannun1", + "~Jacob_Kahn1", + "~Wei-Ning_Hsu2" + ], + "rank": 1548 + }, + { + "url": "https://openreview.net/forum?id=412_KkkGjJ4", + "decision": "Reject", + "ratings": [ + 5, + 7, + 4, + 5 + ], + "rating": "5.26", + "confidences": [ + 4, + 5, + 5, + 5 + ], + "abstract": " Recent researches have achieved substantial advances in learning structured representations from images. However, current methods rely heavily on the annotated mapping between the nodes of scene graphs and object bounding boxes inside images. Here, we explore the problem of learning the mapping between scene graph nodes and visual objects under weak supervision. Our proposed method learns a metric among visual objects and scene graph nodes by incorporating information from both object features and relational features. Extensive experiments on Visual Genome (VG) and Visual Relation Detection (VRD) datasets verify that our model post an improvement on scene graph grounding task over current state-of-the-art approaches. 
Further experiments on scene graph parsing task verify the grounding found by our model can reinforce the performance of the existing method. ", + "title": "Weakly Supervised Scene Graph Grounding", + "authors": [ + "Yizhou Zhang", + "Zhaoheng Zheng", + "Yan Liu" + ], + "emails": [ + "~Yan_Liu1", + "~Zhaoheng_Zheng1", + "~Yizhou_Zhang3" + ], + "rank": 1549 + }, + { + "url": "https://openreview.net/forum?id=RVANVvSi8MZ", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4, + 6, + 5 + ], + "rating": "5.26", + "confidences": [ + 4, + 4, + 3, + 4, + 4 + ], + "abstract": "Line graphs have shown to be effective in improving feature learning in graph neural networks. Line graphs can encode topology information of their original graphs and provide a complementary representational perspective. In this work, we show that the encoded information in line graphs is biased. To overcome this issue, we propose a weighted line graph that corrects biases in line graphs by assigning normalized weights to edges. Based on our weighted line graphs, we develop a weighted line graph convolution layer that takes advantage of line graph structures for better feature learning. In particular, it performs message passing operations on both the original graph and its corresponding weighted line graph. To address efficiency issues in line graph neural networks, we propose to use an incidence matrix to accurately compute the adjacency matrix of the weighted line graph, leading to dramatic reductions in computational resource usage. Experimental results on both real and simulated datasets demonstrate the effectiveness and efficiency of our proposed methods. 
", + "title": "Weighted Line Graph Convolutional Networks", + "authors": [ + "Hongyang Gao", + "Shuiwang Ji" + ], + "emails": [ + "~Hongyang_Gao1", + "~Shuiwang_Ji1" + ], + "rank": 1550 + }, + { + "url": "https://openreview.net/forum?id=FyucNzzMba-", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 5, + 5 + ], + "rating": "5.26", + "confidences": [ + 3, + 5, + 4, + 3, + 4 + ], + "abstract": "Physical reasoning requires forward prediction: the ability to forecast what will happen next given some initial world state. We study the performance of state-of-the-art forward-prediction models in the complex physical-reasoning tasks of the PHYRE benchmark (Bakhtin et al., 2019). We do so by incorporating models that operate on object or pixel-based representations of the world into simple physical-reasoning agents. We find that forward-prediction models can improve physical-reasoning performance, particularly on complex tasks that involve many objects. However, we also find that these improvements are contingent on the test tasks being small variations of train tasks, and that generalization to completely new task templates is challenging. Surprisingly, we observe that forward predictors with better pixel accuracy do not necessarily lead to better physical-reasoning performance. Nevertheless, our best models set a new state-of-the-art on the PHYRE benchmark.", + "title": "Forward Prediction for Physical Reasoning", + "authors": [ + "Rohit Girdhar", + "Laura Gustafson", + "Aaron B. 
Adcock", + "Laurens van der Maaten" + ], + "emails": [ + "~Rohit_Girdhar5", + "~Laurens_van_der_Maaten3", + "~Aaron_B._Adcock1", + "~Laura_Gustafson1" + ], + "rank": 1551 + }, + { + "url": "https://openreview.net/forum?id=ioXEbG_Sf-a", + "decision": "Reject", + "ratings": [ + 6, + 5, + 7, + 3 + ], + "rating": "5.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "The use of past experiences to accelerate temporal difference (TD) learning of value functions, or experience replay, is a key component in deep reinforcement learning. In this work, we propose to reweight experiences based on their likelihood under the stationary distribution of the current policy, and justify this with a contraction argument over the Bellman evaluation operator. The resulting TD objective encourages small approximation errors on the value function over frequently encountered states. To balance bias and variance in practice, we use a likelihood-free density ratio estimator between on-policy and off-policy experiences, and use the ratios as the prioritization weights. We apply the proposed approach empirically on three competitive methods, Soft Actor Critic (SAC), Twin Delayed Deep Deterministic policy gradient (TD3) and Data-regularized Q (DrQ), over 11 tasks from OpenAI gym and DeepMind control suite. We achieve superior sample complexity on 35 out of 45 method-task combinations compared to the best baseline and similar sample complexity on the remaining 10. 
", + "title": "Experience Replay with Likelihood-free Importance Weights", + "authors": [ + "Samarth Sinha", + "Jiaming Song", + "Animesh Garg", + "Stefano Ermon" + ], + "emails": [ + "~Stefano_Ermon1", + "~Animesh_Garg1", + "~Samarth_Sinha1", + "~Jiaming_Song1" + ], + "rank": 1552 + }, + { + "url": "https://openreview.net/forum?id=yOkmUBv9ed", + "decision": "Reject", + "ratings": [ + 4, + 6, + 6, + 5 + ], + "rating": "5.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": " Transformer-based encoder models have achieved promising results on natural language processing (NLP) tasks including question answering (QA). Different from sequence classification or language modeling tasks, hidden states at all positions are used for the final classification in QA. However, we do not always need all the context to answer the raised question. Following this idea, we proposed Block Skim Transformer (BST) to improve and accelerate the processing of transformer QA models. The key idea of BST is to identify the context that must be further processed and the blocks that could be safely discarded early on during inference. Critically, we learn such information from self-attention weights. As a result, the model hidden states are pruned at the sequence dimension, achieving significant inference speedup. We also show that such extra training optimization objection also improves model accuracy. As a plugin to the transformer-based QA models, BST is compatible with other model compression methods without changing existing network architectures. 
BST improves QA models' accuracies on different datasets and achieves 1.6\u00d7 speedup on BERTlarge model.\n", + "title": "Block Skim Transformer for Efficient Question Answering", + "authors": [ + "Yue Guan", + "Jingwen Leng", + "Yuhao Zhu", + "Minyi Guo" + ], + "emails": [ + "~Minyi_Guo1", + "~Yue_Guan2", + "~Jingwen_Leng1", + "~Yuhao_Zhu1" + ], + "rank": 1553 + }, + { + "url": "https://openreview.net/forum?id=Jq8JGA89sDa", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 5 + ], + "rating": "5.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Neural sequence models can generate highly fluent sentences but recent studies have also shown that they are also prone to hallucinate additional content not supported by the input, which can cause a lack of trust in the model.\nTo better assess the faithfulness of the machine outputs, we propose a new task to predict whether each token in the output sequence is hallucinated conditioned on the source input, and collect new manually annotated evaluation sets for this task.\nWe also introduce a novel method for learning to model hallucination detection, based on pretrained language models fine tuned on synthetic data that includes automatically inserted hallucinations. \nExperiments on machine translation and abstract text summarization demonstrate the effectiveness of our proposed approach -- we obtain an average F1 of around 0.6 across all the benchmark datasets.\nFurthermore, we demonstrate how to use the token-level hallucination labels to define a fine-grained loss over the target sequence in the low-resource machine translation and achieve significant improvements over strong baseline methods.\nWe will also release our annotated data and code for future research.", + "title": "Detecting Hallucinated Content in Conditional Neural Sequence Generation", + "authors": [ + "Chunting Zhou", + "Jiatao Gu", + "Mona T. 
Diab", + "Paco Guzm\u00e1n", + "Luke Zettlemoyer", + "Marjan Ghazvininejad" + ], + "emails": [ + "~Jiatao_Gu1", + "~Marjan_Ghazvininejad1", + "~Mona_T._Diab1", + "fb.com", + "~Luke_Zettlemoyer1", + "~Chunting_Zhou1" + ], + "rank": 1554 + }, + { + "url": "https://openreview.net/forum?id=dlEJsyHGeaL", + "decision": "Accept (Poster)", + "ratings": [ + 3, + 6, + 7, + 7 + ], + "rating": "5.25", + "confidences": [ + 4, + 5, + 2, + 1 + ], + "abstract": "While graph neural networks have made impressive progress in classification and regression, few approaches to date perform time series prediction on graphs, and those that do are mostly limited to edge changes. We suggest that graph edits are a more natural interface for graph-to-graph learning. In particular, graph edits are general enough to describe any graph-to-graph change, not only edge changes; they are sparse, making them easier to understand for humans and more efficient computationally; and they are local, avoiding the need for pooling layers in graph neural networks. In this paper, we propose a novel output layer - the graph edit network - which takes node embeddings as input and generates a sequence of graph edits that transform the input graph to the output graph. We prove that a mapping between the node sets of two graphs is sufficient to construct training data for a graph edit network and that an optimal mapping yields edit scripts that are almost as short as the graph edit distance between the graphs. 
We further provide a proof-of-concept empirical evaluation on several graph dynamical systems, which are difficult to learn for baselines from the literature.", + "title": "Graph Edit Networks", + "authors": [ + "Benjamin Paassen", + "Daniele Grattarola", + "Daniele Zambon", + "Cesare Alippi", + "Barbara Eva Hammer" + ], + "emails": [ + "usi.ch", + "~Barbara_Eva_Hammer1", + "~Daniele_Zambon1", + "~Daniele_Grattarola1", + "~Benjamin_Paassen1" + ], + "rank": 1555 + }, + { + "url": "https://openreview.net/forum?id=sfgcqgOm2F_", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 5 + ], + "rating": "5.25", + "confidences": [ + 4, + 4, + 3, + 5 + ], + "abstract": "Modern deep learning models are often trained in parallel over a collection of distributed machines to reduce training time. In such settings, communication of model updates among machines becomes a significant performance bottleneck and various lossy update compression techniques have been proposed to alleviate this problem. In this work, we introduce a new, simple yet theoretically and practically effective compression technique: {\\em natural compression (Cnat)}. Our technique is applied individually to all entries of the to-be-compressed update vector and works by randomized rounding to the nearest (negative or positive) power of two, which can be computed in a ``natural'' way by ignoring the mantissa. We show that compared to no compression, Cnat increases the second moment of the compressed vector by not more than the tiny factor \\nicefrac98, which means that the effect of Cnat on the convergence speed of popular training algorithms, such as distributed SGD, is negligible. However, the communications savings enabled by Cnat are substantial, \nleading to {\\em 3-4\u00d7 improvement in overall theoretical running time}. 
For applications requiring more aggressive compression, we generalize Cnat to {\\em natural dithering}, which we prove is {\\em exponentially better} than the common random dithering technique. Our compression operators can be used on their own or in combination with existing operators for a more aggressive combined effect, and offer new state-of-the-art both in theory and practice.", + "title": "Natural Compression for Distributed Deep Learning", + "authors": [ + "Samuel Horv\u00e1th", + "Chen-Yu Ho", + "Ludovit Horv\u00e1th", + "Atal Narayan Sahu", + "Marco Canini", + "Peter Richtarik" + ], + "emails": [ + "~Marco_Canini1", + "kaust.edu.sa", + "gmail.com", + "~Samuel_Horv\u00e1th1", + "~Peter_Richtarik1" + ], + "rank": 1556 + }, + { + "url": "https://openreview.net/forum?id=9_J4DrgC_db", + "decision": "Reject", + "ratings": [ + 7, + 4, + 7, + 4 + ], + "rating": "5.25", + "confidences": [ + 3, + 3, + 2, + 4 + ], + "abstract": "In policy search methods for reinforcement learning (RL), exploration is often performed by injecting noise either in action space at each step independently or in parameter space over each full trajectory. In prior work, it has been shown that with linear policies, a more balanced trade-off between these two exploration strategies is beneficial. However, that method did not scale to policies using deep neural networks. In this paper, we introduce Deep Coherent Exploration, a general and scalable exploration framework for deep RL algorithms on continuous control, that generalizes step-based and trajectory-based exploration. This framework models the last layer parameters of the policy network as latent variables and uses a recursive inference step within the policy update to handle these latent variables in a scalable manner. 
We find that Deep Coherent Exploration improves the speed and stability of learning of A2C, PPO, and SAC on several continuous control tasks.", + "title": "Deep Coherent Exploration For Continuous Control", + "authors": [ + "Yijie Zhang", + "Herke van Hoof" + ], + "emails": [ + "~Herke_van_Hoof4", + "~Yijie_Zhang1" + ], + "rank": 1557 + }, + { + "url": "https://openreview.net/forum?id=Srmggo3b3X6", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 7, + 4, + 3 + ], + "rating": "5.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "We prove a new upper bound on the generalization gap of classifiers that are obtained by first using self-supervision to learn a representation r of the training~data, and then fitting a simple (e.g., linear) classifier g to the labels. Specifically, we show that (under the assumptions described below) the generalization gap of such classifiers tends to zero if C(g)\u226an, where C(g) is an appropriately-defined measure of the simple classifier g's complexity, and n is the number of training samples. We stress that our bound is independent of the complexity of the representation r. \nWe do not make any structural or conditional-independence assumptions on the representation-learning task, which can use the same training dataset that is later used for classification. Rather, we assume that the training procedure satisfies certain natural noise-robustness (adding small amount of label noise causes small degradation in performance) and rationality (getting the wrong label is not better than getting no label at all) conditions that widely hold across many standard architectures.\nWe also conduct an extensive empirical study of the generalization gap and the quantities used in our assumptions for a variety of self-supervision based algorithms, including SimCLR, AMDIM and BigBiGAN, on the CIFAR-10 and ImageNet datasets. 
We show that, unlike standard supervised classifiers, these algorithms display small generalization gap, and the bounds we prove on this gap are often non vacuous. ", + "title": "For self-supervised learning, Rationality implies generalization, provably", + "authors": [ + "Yamini Bansal", + "Gal Kaplun", + "Boaz Barak" + ], + "emails": [ + "~Boaz_Barak2", + "~Gal_Kaplun1", + "~Yamini_Bansal1" + ], + "rank": 1558 + }, + { + "url": "https://openreview.net/forum?id=yu8JOcFCFrE", + "decision": "Reject", + "ratings": [ + 4, + 7, + 6, + 4 + ], + "rating": "5.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "In this paper, we propose a novel framework for Deep Clustering and multimanifold Representation Learning (DCRL) that preserves the geometric structure of data. In the proposed DCRL framework, manifold clustering is done in the latent space guided by a clustering loss. To overcome the problem that clusteringoriented losses may deteriorate the geometric structure of embeddings in the latent space, an isometric loss is proposed for preserving intra-manifold structure locally and a ranking loss for inter-manifold structure globally. Experimental results on various datasets show that the DCRL framework leads to performances comparable to current state-of-the-art deep clustering algorithms, yet exhibits superior performance for manifold representation. Our results also demonstrate the importance and effectiveness of the proposed losses in preserving geometric structure in terms of visualization and performance metrics. The code is provided in the Supplementary Material.", + "title": "Deep Clustering and Representation Learning that Preserves Geometric Structures", + "authors": [ + "Lirong Wu", + "Zicheng Liu", + "Zelin Zang", + "Jun Xia", + "Siyuan Li", + "Stan Z. 
Li" + ], + "emails": [ + "~Zelin_Zang2", + "~Lirong_Wu1", + "~Stan_Z._Li2", + "~Jun_Xia1", + "~Siyuan_Li6", + "~Zicheng_Liu2" + ], + "rank": 1559 + }, + { + "url": "https://openreview.net/forum?id=J5LS3YJH7Zi", + "decision": "Reject", + "ratings": [ + 5, + 7, + 5, + 4 + ], + "rating": "5.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Dynamic latent variable modelling has been a hugely powerful tool in understanding how spiking activity in populations of neurons can perform computations necessary for adaptive behaviour. The success of such approaches has been enabled by the ability to construct models derived with the characterization of spiking activity as point-processes since spiking dynamics occur on a much faster time-scale than the computational dynamics being inferred. Other experimental techniques, such as calcium imaging, pose a problem for latent variable modelling of computational dynamics, since the time-scales of calcium dynamics and computational dynamics overlap. As such, the success of dynamic latent variable modelling in calcium imaging data rests on being able to disentangle the contribution of these two sources of variation. Here we extend recent advances using variational autoencoders to analyze neural data, by incorporating a ladder architecture that can infer a hierarchy of dynamical systems. Using built-in inductive biases for calcium dynamics, we can capture calcium flux as well as underlying dynamics of neural computation. First, we demonstrate with synthetic calcium data that we can correctly infer an underlying Lorenz attractor at the same time as calcium dynamics. Next, we show that we can infer appropriate rotational dynamics in spiking data from macaque motor cortex after it has been converted into calcium fluorescence data via a calcium dynamics model. 
Finally, we show that our method applied to real calcium imaging data from primary visual cortex in mice allows us to infer latent factors that carry salient sensory information about unexpected stimuli. These results demonstrate that variational ladder autoencoders are a promising approach for inferring hierarchical dynamics in experimental settings where the measured variable has its own slow dynamics, such as calcium imaging data, thereby providing the neuroscience community with a new analysis tool for a wider array of data modalities.", + "title": "CaLFADS: latent factor analysis of dynamical systems in calcium imaging data", + "authors": [ + "Luke Yuri Prince", + "Shahab Bakhtiari", + "Colleen J Gillon", + "Blake Aaron Richards" + ], + "emails": [ + "~Shahab_Bakhtiari1", + "~Colleen_J_Gillon1", + "~Blake_Aaron_Richards1", + "~Luke_Yuri_Prince1" + ], + "rank": 1560 + }, + { + "url": "https://openreview.net/forum?id=68747kJ0qKt", + "decision": "Reject", + "ratings": [ + 4, + 7, + 4 + ], + "rating": "5.25", + "confidences": [ + 4, + 5, + 3 + ], + "abstract": "We examine Dropout through the perspective of interactions. Given N variables, there are O(N2) possible pairwise interactions, O(N3) possible 3-way interactions, i.e. O(Nk) possible interactions of k variables. Conversely, the probability of an interaction of k variables surviving Dropout at rate p is O((1\u2212p)k). In this paper, we show that these rates cancel, and as a result, Dropout selectively regularizes against learning higher-order interactions. We prove this new perspective analytically for Input Dropout and empirically for Activation Dropout. This perspective on Dropout has several practical implications: (1) higher Dropout rates should be used when we need stronger regularization against spurious high-order interactions, (2) caution must be used when interpreting Dropout-based feature saliency measures, and (3) networks trained with Input Dropout are biased estimators, even with infinite data. 
We also compare Dropout to regularization via weight decay and early stopping and find that it is difficult to obtain the same regularization against high-order interactions with these methods.", + "title": "On Dropout, Overfitting, and Interaction Effects in Deep Neural Networks", + "authors": [ + "Ben Lengerich", + "Eric Xing", + "Rich Caruana" + ], + "emails": [ + "~Ben_Lengerich1", + "~Eric_Xing1", + "~Rich_Caruana1" + ], + "rank": 1561 + }, + { + "url": "https://openreview.net/forum?id=vOchfRdvPy7", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 5 + ], + "rating": "5.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Adversarial training algorithms have been proven to be reliable to improve machine learning models' robustness against adversarial examples. However, we find that adversarial training algorithms tend to introduce severe disparity of accuracy and robustness between different groups of data. For instance, PGD adversarially trained ResNet18 model on CIFAR-10 has 93% clean accuracy and 67% PGD l\u221e-8 adversarial accuracy on the class ''automobile'' but only 59% and 17% on class ''cat''. This phenomenon happens in balanced datasets and does not exist in naturally trained models when only using clean samples. In this work, we theoretically show that this phenomenon can generally happen under adversarial training algorithms which minimize DNN models' robust errors. 
Motivated by these findings, we propose a Fair-Robust-Learning (FRL) framework to mitigate this unfairness problem when doing adversarial defenses and experimental results validate the effectiveness of FRL.", + "title": "To be Robust or to be Fair: Towards Fairness in Adversarial Training", + "authors": [ + "Han Xu", + "Xiaorui Liu", + "Yaxin Li", + "Jiliang Tang" + ], + "emails": [ + "~Xiaorui_Liu1", + "~Jiliang_Tang1", + "~Han_Xu1", + "msu.edu" + ], + "rank": 1562 + }, + { + "url": "https://openreview.net/forum?id=e68IYJNOYau", + "ratings": [ + 4, + 7, + 5, + 5 + ], + "rating": "5.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": " In this paper, we propose a new framework for point cloud instance segmentation. Our framework has two steps: an embedding step and a clustering step. In the embedding step, our main contribution is to propose a probabilistic embedding space for point cloud embedding. Specifically, each point is represented as a tri-variate normal distribution. In the clustering step, we propose a novel loss function, which benefits both the semantic segmentation and the clustering. Our experimental results show important improvements to the SOTA, i.e., 3.1\\% increased average per-category mAP on the PartNet dataset.", + "title": "Point Cloud Instance Segmentation using Probabilistic Embeddings", + "authors": [ + "Biao Zhang", + "Peter Wonka" + ], + "emails": [ + "~Peter_Wonka1", + "~Biao_Zhang5" + ], + "rank": 1563 + }, + { + "url": "https://openreview.net/forum?id=KOtxfjpQsq", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 5 + ], + "rating": "5.25", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "Model-based reinforcement learning (MBRL) has been applied to meta-learning settings and has demonstrated its high sample efficiency. \nHowever, in previous MBRL for meta-learning settings, policies are optimized via rollouts that fully rely on a predictive model of an environment. 
\nThus, its performance in a real environment tends to degrade when the predictive model is inaccurate. \nIn this paper, we prove that performance degradation can be suppressed by using branched meta-rollouts. \nOn the basis of this theoretical analysis, we propose Meta-Model-based Meta-Policy Optimization (M3PO), in which the branched meta-rollouts are used for policy optimization. \nWe demonstrate that M3PO outperforms existing meta reinforcement learning methods in continuous-control benchmarks. ", + "title": "Meta-Model-Based Meta-Policy Optimization", + "authors": [ + "Takuya Hiraoka", + "Takahisa Imagawa", + "Voot Tangkaratt", + "Takayuki Osa", + "Takashi Onishi", + "Yoshimasa Tsuruoka" + ], + "emails": [ + "~Voot_Tangkaratt1", + "nec.com", + "~Takahisa_Imagawa1", + "~Yoshimasa_Tsuruoka1", + "~Takayuki_Osa1" + ], + "rank": 1564 + }, + { + "url": "https://openreview.net/forum?id=Dtahsj2FkrK", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6 + ], + "rating": "5.25", + "confidences": [ + 4, + 2, + 2 + ], + "abstract": "A/B testing, or online experiment is a standard business strategy to compare a new product with an old one in pharmaceutical, technological, and traditional industries. The aim of this paper is to introduce a reinforcement learn- ing framework for carrying A/B testing in two-sided marketplace platforms, while characterizing the long-term treatment effects. Our proposed testing procedure allows for sequential monitoring and online updating. It is generally applicable to a variety of treatment designs in different industries. In addition, we systematically investigate the theoretical properties (e.g., size and power) of our testing procedure. Finally, we apply our framework to both synthetic data and a real-world data example obtained from a technological company to illustrate its advantage over the current practice. 
\n", + "title": "A REINFORCEMENT LEARNING FRAMEWORK FOR TIME DEPENDENT CAUSAL EFFECTS EVALUATION IN A/B TESTING", + "authors": [ + "Chengchun Shi", + "Xiaoyu Wang", + "Shikai Luo", + "Rui Song", + "Hongtu Zhu", + "Jieping Ye" + ], + "emails": [ + "didiglobal.com", + "~Jieping_Ye3", + "~Chengchun_Shi1", + "gmail.com", + "~Rui_Song2", + "~Hongtu_Zhu2" + ], + "rank": 1565 + }, + { + "url": "https://openreview.net/forum?id=tckGH8K9y6o", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 5 + ], + "rating": "5.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Leveraging the framework of Optimal Transport, we introduce a new family of generative autoencoders with a learnable prior, called Symmetric Wasserstein Autoencoders (SWAEs). We propose to symmetrically match the joint distributions of the observed data and the latent representation induced by the encoder and the decoder. The resultant algorithm jointly optimizes the modelling losses in both the data and the latent spaces with the loss in the data space leading to the denoising effect. With the symmetric treatment of the data and the latent representation, the algorithm implicitly preserves the local structure of the data in the latent space. To further improve the latent representation, we incorporate a reconstruction loss into the objective, which significantly benefits both the generation and reconstruction. 
We empirically show the superior performance of SWAEs over several state-of-the-art generative autoencoders in terms of classification, reconstruction, and generation.", + "title": "Symmetric Wasserstein Autoencoders", + "authors": [ + "Sun Sun", + "Hongyu Guo" + ], + "emails": [ + "~Hongyu_Guo1", + "~Sun_Sun1" + ], + "rank": 1566 + }, + { + "url": "https://openreview.net/forum?id=1ibNKMp8SKc", + "decision": "Reject", + "ratings": [ + 3, + 7, + 6 + ], + "rating": "5.25", + "confidences": [ + 4, + 3, + 5 + ], + "abstract": "Despite impressive progress in the last decade, it still remains an open challenge to build models that generalize well across multiple tasks and datasets. One path to achieve this is to learn meaningful and compact representations, in which different semantic aspects of data are structurally disentangled. The focus of disentanglement approaches has been on separating independent factors of variation despite the fact that real-world observations are often not structured into meaningful independent causal variables. In this work, we bridge the gap to real-world scenarios by analyzing the behavior of most prominent methods and disentanglement scores on correlated data in a large scale empirical study (including 4260 models). We show that systematically induced correlations in the dataset are being learned and reflected in the latent representations, while widely used disentanglement scores fall short of capturing these latent correlations. Finally, we demonstrate how to disentangle these latent correlations using weak supervision, even if we constrain this supervision to be causally plausible. 
Our results thus support the argument to learn independent mechanisms rather than independent factors of variations.", + "title": "On Disentangled Representations Learned From Correlated Data", + "authors": [ + "Frederik Tr\u00e4uble", + "Elliot Creager", + "Niki Kilbertus", + "Anirudh Goyal", + "Francesco Locatello", + "Bernhard Sch\u00f6lkopf", + "Stefan Bauer" + ], + "emails": [ + "~Frederik_Tr\u00e4uble1", + "~Bernhard_Sch\u00f6lkopf1", + "~Stefan_Bauer1", + "~Francesco_Locatello1", + "~Elliot_Creager1", + "~Niki_Kilbertus1", + "~Anirudh_Goyal1" + ], + "rank": 1567 + }, + { + "url": "https://openreview.net/forum?id=P84ryxVG6tR", + "decision": "Reject", + "ratings": [ + 6, + 4, + 7, + 4 + ], + "rating": "5.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Accelerating the learning processes for complex tasks by leveraging previously learned tasks has been one of the most challenging problems in reinforcement learning, especially when the similarity between source and target tasks is low or unknown. In this work, we propose a REPresentation-And-INstance Transfer algorithm (REPAINT) for deep actor-critic reinforcement learning paradigm. In representation transfer, we adopt a kickstarted training method using a pre-trained teacher policy by introducing an auxiliary cross-entropy loss. In instance transfer, we develop a sampling approach, i.e., advantage-based experience replay, on transitions collected following the teacher policy, where only the samples with high advantage estimates are retained for policy update. We consider both learning an unseen target task by transferring from previously learned teacher tasks and learning a partially unseen task composed of multiple sub-tasks by transferring from a pre-learned teacher sub-task. 
In several benchmark experiments, REPAINT significantly reduces the total training time and improves the asymptotic performance compared to training with no prior knowledge and other baselines.", + "title": "REPAINT: Knowledge Transfer in Deep Actor-Critic Reinforcement Learning", + "authors": [ + "Yunzhe Tao", + "Sahika Genc", + "TAO SUN", + "Sunil Mallya" + ], + "emails": [ + "~TAO_SUN4", + "~Yunzhe_Tao2", + "~Sunil_Mallya1", + "~Sahika_Genc1" + ], + "rank": 1568 + }, + { + "url": "https://openreview.net/forum?id=WPO0vDYLXem", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 5 + ], + "rating": "5.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "After developer adjustments to a machine learning (ML) algorithm, how can the results of an old hyperparameter optimization (HPO) automatically be used to speedup a new HPO? This question poses a challenging problem, as developer adjustments can change which hyperparameter settings perform well, or even the hyperparameter search space itself. While many approaches exist that leverage knowledge obtained on previous tasks, so far, knowledge from previous development steps remains entirely untapped. In this work, we remedy this situation and propose a new research framework: hyperparameter transfer across adjustments (HT-AA). To lay a solid foundation for this research framework, we provide four simple HT-AA baseline algorithms and eight benchmarks\nchanging various aspects of ML algorithms, their hyperparameter search spaces, and the neural architectures used. The best baseline, on average and depending on the budgets for the old and new HPO, reaches a given performance 1.2-3.6x faster than a prominent HPO algorithm without transfer. As HPO is a crucial step in ML development but requires extensive computational resources, this speedup would lead to faster development cycles, lower costs, and reduced environmental impacts. 
To make these benefits available to ML developers off-the-shelf and to facilitate future research on HT-AA, we provide python packages for our baselines and benchmarks.", + "title": "Hyperparameter Transfer Across Developer Adjustments", + "authors": [ + "Danny Stoll", + "J\u00f6rg K.H. Franke", + "Diane Wagner", + "Simon Selg", + "Frank Hutter" + ], + "emails": [ + "cs.uni-freiburg.de", + "~J\u00f6rg_K.H._Franke1", + "~Frank_Hutter1", + "~Danny_Stoll1" + ], + "rank": 1569 + }, + { + "url": "https://openreview.net/forum?id=iox4AjpZ15", + "decision": "Reject", + "ratings": [ + 5, + 4, + 8, + 4 + ], + "rating": "5.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "It is widely believed that a dimension reduction (DR) process drops information inevitably in most practical scenarios. Thus, most methods try to preserve some essential information of data after DR, as well as manifold based DR methods. However, they usually fail to yield satisfying results, especially in high-dimensional cases. In the context of manifold learning, we think that a good low-dimensional representation should preserve the topological and geometric properties of data manifolds, which involve exactly the entire information of the data manifolds. In this paper, we define the problem of information-lossless NLDR with the manifold assumption and propose a novel two-stage NLDR method, called invertible manifold learning (inv-ML), to tackle this problem. A local isometry constraint of preserving local geometry is applied under this assumption in inv-ML. Firstly, a homeomorphic sparse coordinate transformation is learned to find the low-dimensional representation without losing topological information. Secondly, a linear compression is performed on the learned sparse coding, with the trade-off between the target dimension and the incurred information loss. 
Experiments are conducted on seven datasets with a neural network implementation of inv-ML, called i-ML-Enc, which demonstrate that the proposed inv-ML not only achieves invertible NLDR in comparison with typical existing methods but also reveals the characteristics of the learned manifolds through linear interpolation in latent space. Moreover, we find that the reliability of tangent space approximated by the local neighborhood on real-world datasets is key to the success of manifold based DR algorithms. The code will be made available soon.", + "title": "Invertible Manifold Learning for Dimension Reduction", + "authors": [ + "Siyuan Li", + "Haitao Lin", + "Zelin Zang", + "Lirong Wu", + "Jun Xia", + "Stan Z. Li" + ], + "emails": [ + "~Zelin_Zang2", + "~Lirong_Wu1", + "~Stan_Z._Li2", + "~Jun_Xia1", + "~Haitao_Lin2", + "~Siyuan_Li6" + ], + "rank": 1570 + }, + { + "url": "https://openreview.net/forum?id=p84tly8c4zf", + "decision": "Reject", + "ratings": [ + 4, + 7, + 5, + 4 + ], + "rating": "5.25", + "confidences": [ + 2, + 4, + 3, + 3 + ], + "abstract": "Data augmentation is a widely used training trick in deep learning to improve the network generalization ability. Despite many encouraging results, several recent studies did point out limitations of the conventional data augmentation scheme in certain scenarios, calling for a better theoretical understanding of data augmentation. In this work, we develop a comprehensive analysis that reveals pros and cons of data augmentation. The main limitation of data augmentation arises from the data bias, i.e. the augmented data distribution can be quite different from the original one. This data bias leads to a suboptimal performance of existing data augmentation methods. To this end, we develop two novel algorithms, termed \"AugDrop\" and \"MixLoss\", to correct the data bias in the data augmentation. 
Our theoretical analysis shows that both algorithms are guaranteed to improve the effect of data augmentation through the bias correction, which is further validated by our empirical studies. Finally, we propose a generic algorithm \"WeMix\" by combining AugDrop and MixLoss, whose effectiveness is observed from extensive empirical evaluations.", + "title": "WeMix: How to Better Utilize Data Augmentation", + "authors": [ + "Yi Xu", + "Asaf Noy", + "Ming Lin", + "Qi Qian", + "Li Hao", + "Rong Jin" + ], + "emails": [ + "~Rong_Jin1", + "~Ming_Lin4", + "~Yi_Xu8", + "~Qi_Qian1", + "~Li_Hao1", + "alibaba-inc.com" + ], + "rank": 1571 + }, + { + "url": "https://openreview.net/forum?id=b7ZRqEFXdQ", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 4 + ], + "rating": "5.25", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "Generative Adversarial Networks (GAN) are facing great challenges in synthesizing sequences of discrete elements, such as mode dropping and unstable training. The binary classifier in the discriminator may limit the capacity of learning signals and thus hinder the advance of adversarial training. To address such issues, apart from the binary classification feedback, we harness a Feature Statistics Alignment (FSA) paradigm to deliver fine-grained signals in the latent high-dimensional representation space. Specifically, FSA forces the mean statistics of the fake data distribution to approach that of real data as close as possible in a finite-dimensional feature space. Experiments on synthetic and real benchmark datasets show the superior performance in quantitative evaluation and demonstrate the effectiveness of our approach to discrete sequence generation. To the best of our knowledge, the proposed architecture is the first that employs feature alignment regularization in the Gumbel-Softmax based GAN framework for sequence generation. 
", + "title": "Improving Sequence Generative Adversarial Networks with Feature Statistics Alignment", + "authors": [ + "Yekun Chai", + "Qiyue Yin", + "Junge Zhang" + ], + "emails": [ + "~Yekun_Chai1", + "~Qiyue_Yin1", + "~Junge_Zhang1" + ], + "rank": 1572 + }, + { + "url": "https://openreview.net/forum?id=asLT0W1w7Li", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 6 + ], + "rating": "5.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Balancing exploration and exploitation is crucial in reinforcement learning (RL). In this paper, we study the model-based posterior sampling algorithm in continuous state-action spaces theoretically and empirically. First, we improve the regret bound: with the assumption that reward and transition functions can be modeled as Gaussian Processes with linear kernels, we develop a Bayesian regret bound of O~(H3/2dT), where H is the episode length, d is the dimension of the state-action space, and T indicates the total time steps. Our bound can be extended to nonlinear cases as well: using linear kernels on the feature representation \u03d5, the Bayesian regret bound becomes O~(H3/2d\u03d5T), where d\u03d5 is the dimension of the representation space. Moreover, we present MPC-PSRL, a model-based posterior sampling algorithm with model predictive control for action selection. To capture the uncertainty in models and realize posterior sampling, we use Bayesian linear regression on the penultimate layer (the feature representation layer \u03d5) of neural networks. Empirical results show that our algorithm achieves the best sample efficiency in benchmark control tasks compared to prior model-based algorithms, and matches the asymptotic performance of model-free algorithms. 
", + "title": "Efficient Exploration for Model-based Reinforcement Learning with Continuous States and Actions", + "authors": [ + "Ying Fan", + "Yifei Ming" + ], + "emails": [ + "~Yifei_Ming1", + "~Ying_Fan2" + ], + "rank": 1573 + }, + { + "url": "https://openreview.net/forum?id=dvSExzhjG9D", + "decision": "Reject", + "ratings": [ + 5, + 4, + 6, + 6 + ], + "rating": "5.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "The learning rate (LR) is one of the most important hyper-parameters in stochastic gradient descent (SGD) for deep neural networks (DNN) training and generalization. However, current hand-designed LR schedules need to manually pre-specify a fixed form, which limits their ability to adapt to non-convex optimization problems due to the significant variation of training dynamics. Meanwhile, it always needs to search a proper LR schedule from scratch for new tasks. To address these issues, we propose to parameterize LR schedules with an explicit mapping formulation, called MLR-SNet. The learnable structure brings more flexibility for MLR-SNet to learn a proper LR schedule to comply with the training dynamics of DNN. Image and text classification benchmark experiments substantiate the capability of our method for achieving proper LR schedules. Moreover, the meta-learned MLR-SNet is tuning-free plug-and-play to generalize to new heterogeneous tasks. We transfer our meta-trained MLR-SNet to tasks like different training epochs, network architectures, datasets, especially large scale ImageNet dataset, and achieve comparable performance with hand-designed LR schedules. Finally, MLR-Net can achieve better robustness when training data is biased with corrupted noise. 
", + "title": "MLR-SNet: Transferable LR Schedules for Heterogeneous Tasks", + "authors": [ + "Jun Shu", + "Yanwen Zhu", + "Qian Zhao", + "Deyu Meng", + "Zongben Xu" + ], + "emails": [ + "~Zongben_Xu1", + "~Qian_Zhao1", + "~Jun_Shu1", + "~Deyu_Meng1", + "stu.xjtu.edu.cn" + ], + "rank": 1574 + }, + { + "url": "https://openreview.net/forum?id=lcNa5mQ-CSb", + "decision": "Reject", + "ratings": [ + 7, + 3, + 5, + 6 + ], + "rating": "5.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Causal discovery has witnessed significant progress over the past decades. Most algorithms in causal discovery consider a single domain with a fixed distribution. However, it is commonplace to encounter heterogeneous data (data from different domains with distribution shifts). Applying existing methods on such heterogeneous data may lead to spurious edges or incorrect directions in the learned graph. In this paper, we develop a novel score-based approach for causal discovery from heterogeneous data. Specifically, we propose a Multiple-Domain Score Search (MDSS) algorithm, which is guaranteed to find the correct graph skeleton asymptotically. Furthermore, benefiting from distribution shifts, MDSS enables the detection of more causal directions than previous algorithms designed for single domain data. The proposed MDSS can be readily incorporated into off-the-shelf search strategies, such as the greedy search and the policy-gradient-based search. 
Theoretical analyses and extensive experiments on both synthetic and real data demonstrate the efficacy of our method.", + "title": "Score-based Causal Discovery from Heterogeneous Data", + "authors": [ + "Chenwei Ding", + "Biwei Huang", + "Mingming Gong", + "Kun Zhang", + "Tongliang Liu", + "Dacheng Tao" + ], + "emails": [ + "~Mingming_Gong1", + "~Dacheng_Tao1", + "~Chenwei_Ding1", + "~Biwei_Huang1", + "~Kun_Zhang1", + "~Tongliang_Liu1" + ], + "rank": 1575 + }, + { + "url": "https://openreview.net/forum?id=wTWLfuDkvKp", + "decision": "Reject", + "ratings": [ + 4, + 6, + 6, + 5 + ], + "rating": "5.25", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "Underlying the use of statistical approaches for a wide range of applications is the assumption that the probabilities obtained from a statistical model are representative of the \u201ctrue\u201d probability that event, or outcome, will occur. Unfortunately, for modern deep neural networks this is not the case, they are often observed to be poorly calibrated. Additionally, these deep learning approaches make use of large numbers of model parameters, motivating the use of Bayesian, or ensemble approximation, approaches to handle issues with parameter estimation. This paper explores the application of calibration schemes to deep ensembles from both a theoretical perspective and empirically on a standard image classification task, CIFAR-100. The underlying theoretical requirements for calibration, and associated calibration criteria, are first described. It is shown that well calibrated ensemble members will not necessarily yield a well calibrated ensemble prediction, and if the ensemble prediction is well calibrated its performance cannot exceed that of the average performance of the calibrated ensemble members. On CIFAR-100 the impact of calibration for ensemble prediction, and associated calibration is evaluated. 
Additionally the situation where multiple different topologies are combined together is discussed.", + "title": "Should Ensemble Members Be Calibrated?", + "authors": [ + "Xixin Wu", + "Mark Gales" + ], + "emails": [ + "~Xixin_Wu1", + "~Mark_Gales1" + ], + "rank": 1576 + }, + { + "url": "https://openreview.net/forum?id=nsZGadY22N4", + "decision": "Reject", + "ratings": [ + 3, + 8, + 5, + 5 + ], + "rating": "5.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Off-policy deep reinforcement learning (RL) has been successful in a range of challenging domains. However, standard off-policy RL algorithms can suffer from low signal and even instability in Q-learning because target values are derived from current Q-estimates, which are often noisy. To mitigate the issue, we propose ensemble-based weighted Bellman backups, which re-weight target Q-values based on uncertainty estimates from a Q-ensemble. We empirically observe that the proposed method stabilizes and improves learning on both continuous and discrete control benchmarks. We also specifically investigate the signal-to-noise aspect by studying environments with noisy rewards, and find that weighted Bellman backups significantly outperform standard Bellman backups. Furthermore, since our weighted Bellman backups rely on maintaining an ensemble, we investigate how weighted Bellman backups interact with UCB Exploration. 
By enforcing the diversity between agents using Bootstrap, we show that these different ideas are largely orthogonal and can be fruitfully integrated, together further improving the performance of existing off-policy RL algorithms, such as Soft Actor-Critic and Rainbow DQN, for both continuous and discrete control tasks on both low-dimensional and high-dimensional environments.", + "title": "Weighted Bellman Backups for Improved Signal-to-Noise in Q-Updates", + "authors": [ + "Kimin Lee", + "Michael Laskin", + "Aravind Srinivas", + "Pieter Abbeel" + ], + "emails": [ + "~Aravind_Srinivas1", + "~Michael_Laskin1", + "~Kimin_Lee1", + "~Pieter_Abbeel2" + ], + "rank": 1577 + }, + { + "url": "https://openreview.net/forum?id=5Zl7WYi7Ndj", + "ratings": [ + 5, + 5, + 5, + 6 + ], + "rating": "5.24", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "The nonlocal-based blocks are designed for capturing long-range spatial-temporal dependencies in computer vision tasks. Although having shown excellent performances, they lack the mechanism to encode the rich, structured information among elements in an image. \nIn this paper, to theoretically analyze the property of these nonlocal-based blocks, we provide a unified framework to interpret them, where we view them as a graph filter generated on a fully-connected graph. When choosing Chebyshev graph filter, a generalized formulation can be derived for explaining the existing nonlocal-based blocks (e.g. nonlocal block, nonlocal stage, double attention block) and uses to analyze their irrationality. Furthermore, by removing the irrationality, we propose an efficient and robust Chebyshev spectral nonlocal block, which can be more flexibly inserted into deep neural networks than the existing nonlocal blocks. 
Experimental results demonstrate the clear-cut improvements and practical applicabilities of the proposed spectral nonlocal blocks on image classification (Cifar-10/100, ImageNet), fine-grained image classification (CUB-200), action recognition (UCF-101) tasks.", + "title": "A Unified Framework to Analyze and Design the Nonlocal Blocks for Neural Networks", + "authors": [ + "Lei Zhu", + "Qi She", + "Changhu Wang" + ], + "emails": [ + "~Qi_She1", + "~Lei_Zhu10", + "~Changhu_Wang3" + ], + "rank": 1578 + }, + { + "url": "https://openreview.net/forum?id=VMtftZqMruq", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 5 + ], + "rating": "5.24", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "Value decomposition is a popular and promising approach to scaling up multi-agent reinforcement learning in cooperative settings. However, the theoretical understanding of such methods is limited. In this paper, we introduce a variant of the fitted Q-iteration framework for analyzing multi-agent Q-learning with value decomposition. Based on this framework, we derive a closed-form solution to the empirical Bellman error minimization with linear value decomposition. With this novel solution, we further reveal two interesting insights: 1) linear value decomposition implicitly implements a classical multi-agent credit assignment called counterfactual difference rewards; and 2) On-policy data distribution or richer Q function classes can improve the training stability of multi-agent Q-learning. In the empirical study, our experiments demonstrate the realizability of our theoretical closed-form formulation and implications in the didactic examples and a broad set of StarCraft II unit micromanagement tasks, respectively. 
", + "title": "Towards Understanding Linear Value Decomposition in Cooperative Multi-Agent Q-Learning", + "authors": [ + "Jianhao Wang", + "Zhizhou Ren", + "Beining Han", + "Jianing Ye", + "Chongjie Zhang" + ], + "emails": [ + "~Beining_Han1", + "~Chongjie_Zhang1", + "~Zhizhou_Ren1", + "~Jianing_Ye1", + "~Jianhao_Wang1" + ], + "rank": 1579 + }, + { + "url": "https://openreview.net/forum?id=WMUSP41HQWS", + "decision": "Reject", + "ratings": [ + 6, + 6, + 4, + 5 + ], + "rating": "5.23", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Neural ordinary differential equations (Neural ODEs) are appreciated for their ability to significantly reduce the number of parameters when constructing a neural network. On the other hand, they are sometimes blamed for their long forward-pass inference time, which is incurred by solving integral problems. To improve the model accuracy, they rely on advanced solvers, such as the Dormand--Prince (DOPRI) method. To solve an integral problem, however, it requires at least tens (or sometimes thousands) of steps in many Neural ODE experiments. In this work, we propose to i) directly regularize the step size of DOPRI to make the forward-pass faster and ii) dynamically choose a simpler integrator than DOPRI for a carefully selected subset of input. Because it is not the case that every input requires the advanced integrator, we design an auxiliary neural network to choose an appropriate integrator given input to decrease the overall inference time without significantly sacrificing accuracy. We consider the Euler method, the fourth-order Runge--Kutta (RK4) method, and DOPRI as selection candidates. We found that 10-30% of cases can be solved with simple integrators in our experiments. 
Therefore, the overall number of functional evaluations (NFE) decreases up to 78% with improved accuracy.", + "title": "DISE: Dynamic Integrator Selection to Minimize Forward Pass Time in Neural ODEs", + "authors": [ + "Soyoung Kang", + "Ganghyeon Park", + "Kwang-Sung Jun", + "Noseong Park" + ], + "emails": [ + "yonsei.ac.kr", + "~Kwang-Sung_Jun1", + "~Noseong_Park1" + ], + "rank": 1580 + }, + { + "url": "https://openreview.net/forum?id=WoLQsYU8aZ", + "decision": "Reject", + "ratings": [ + 3, + 6, + 5, + 7 + ], + "rating": "5.23", + "confidences": [ + 3, + 3, + 4, + 3 + ], + "abstract": "OpenAI's Gym library contains a large, diverse set of environments that are useful benchmarks in reinforcement learning, under a single elegant Python API (with tools to develop new compliant environments) . The introduction of this library has proven a watershed moment for the reinforcement learning community, because it created an accessible set of benchmark environments that everyone could use (including wrapper important existing libraries), and because a standardized API let RL learning methods and environments from anywhere be trivially exchanged. 
This paper similarly introduces PettingZoo, a library of diverse set of multi-agent environments under a single elegant Python API, with tools to easily make new compliant environments.", + "title": "PettingZoo: Gym for Multi-Agent Reinforcement Learning", + "authors": [ + "Justin K Terry", + "Benjamin Black", + "Mario Jayakumar", + "Ananth Hari", + "Luis Santos", + "Clemens Dieffendahl", + "Niall L Williams", + "Yashas Lokesh", + "Ryan Sullivan", + "Caroline Horsch", + "Praveen Ravi" + ], + "emails": [ + "umd.edu", + "campus.tu-berlin.de", + "swarmlabs.com", + "~Justin_K_Terry1" + ], + "rank": 1581 + }, + { + "url": "https://openreview.net/forum?id=zcOJOUjUcyF", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 5 + ], + "rating": "5.23", + "confidences": [ + 4, + 3, + 3, + 3 + ], + "abstract": "Reducing the sample complexity associated with deep learning (DL) remains one of the most important problems in both theory and practice since its advent. Semi-supervised learning (SSL) tackles this task by leveraging unlabeled instances which are usually more accessible than their labeled counterparts. Active learning (AL) directly seeks to reduce the sample complexity by training a classification network and querying unlabeled instances to be annotated by a human-in-the-loop. Under relatively strict settings, it has been shown that both SSL and AL can theoretically achieve the same performance of fully-supervised learning (SL) using far less labeled samples. While empirical works have shown that SSL can attain this benefit in practice, DL-based AL algorithms have yet to show their success to the extent achieved by SSL. Given the accessible pool of unlabeled instances in pool-based AL, we argue that the annotation efficiency brought by AL algorithms that seek diversity on labeled samples can be improved upon when using SSL as the training scheme. 
Equipped with a few theoretical insights, we designed an AL algorithm that rather focuses on controlling the convergence rate of a classification network by actively querying instances to improve the rate of convergence upon inclusion to the labeled set. We name this AL scheme convergence rate control (CRC), and our experiments show that a deep neural network trained using a combination of CRC and a recently proposed SSL algorithm can quickly achieve high performance using far less labeled samples than SL. In contrast to a few works combining independently developed AL and SSL (ASSL) algorithms, our method is a natural fit to ASSL, and we hope our work can catalyze research combining AL and SSL as opposed to an exclusion of either.", + "title": "Better Optimization can Reduce Sample Complexity: Active Semi-Supervised Learning via Convergence Rate Control", + "authors": [ + "Seo Taek Kong", + "Soomin Jeon", + "Jaewon Lee", + "Hong-Seok Lee", + "Kyu-Hwan Jung" + ], + "emails": [ + "~Seo_Taek_Kong1", + "vuno.co", + "~Kyu-Hwan_Jung1", + "~Jaewon_Lee2" + ], + "rank": 1582 + }, + { + "url": "https://openreview.net/forum?id=4f04RAhMUo6", + "decision": "Reject", + "ratings": [ + 6, + 4, + 6 + ], + "rating": "5.23", + "confidences": [ + 4, + 5, + 4 + ], + "abstract": "Current reinforcement learning (RL) methods use simulation models as simple black-box oracles. In this paper, with the goal of improving the performance exhibited by RL algorithms, we explore a systematic way of leveraging the additional information provided by an emerging class of differentiable simulators. Building on concepts established by Deterministic Policy Gradients (DPG) methods, the neural network policies learned with our approach represent deterministic actions. In a departure from standard methodologies, however, learning these policy does not hinge on approximations of the value function that must be learned concurrently in an actor-critic fashion. 
Instead, we exploit differentiable simulators to directly compute the analytic gradient of a policy's value function with respect to the actions it outputs. This, in turn, allows us to efficiently perform locally optimal policy improvement iterations. Compared against other state-of-the-art RL methods, we show that with minimal hyper-parameter tuning our approach consistently leads to better asymptotic behavior across a set of payload manipulation tasks that demand high precision.", + "title": "PODS: Policy Optimization via Differentiable Simulation", + "authors": [ + "Miguel Angel Zamora Mora", + "Momchil Peychev", + "Sehoon Ha", + "Martin Vechev", + "Stelian Coros" + ], + "emails": [ + "ethz.ch", + "~Stelian_Coros1", + "~Martin_Vechev1", + "gmail.com", + "~Miguel_Angel_Zamora_Mora1" + ], + "rank": 1583 + }, + { + "url": "https://openreview.net/forum?id=H92-E4kFwbR", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 5 + ], + "rating": "5.23", + "confidences": [ + 4, + 3, + 3, + 3 + ], + "abstract": "One intriguing property of deep neural networks (DNNs) is their vulnerability to adversarial perturbations. Despite the plethora of work on defending against individual perturbation models, improving DNN robustness against the combinations of multiple perturbations is still fairly under-studied. In this paper, we propose \\underline{c}omposite \\underline{a}dversarial \\underline{t}raining (CAT), a novel training method that flexibly integrates and optimizes multiple adversarial losses, leading to significant robustness improvement with respect to individual perturbations as well as their ``compositions''. 
Through empirical evaluation on benchmark datasets and models, we show that CAT outperforms existing adversarial training methods by large margins in defending against the compositions of pixel perturbations and spatial transformations, two major classes of adversarial perturbation models, while incurring limited impact on clean inputs.", + "title": "Composite Adversarial Training for Multiple Adversarial Perturbations and Beyond", + "authors": [ + "Xinyang Zhang", + "Zheng Zhang", + "Ting Wang" + ], + "emails": [ + "psu.edu", + "~Xinyang_Zhang5", + "~Ting_Wang1" + ], + "rank": 1584 + }, + { + "url": "https://openreview.net/forum?id=WZnVnlFBKFj", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 6 + ], + "rating": "5.23", + "confidences": [ + 4, + 3, + 3, + 3 + ], + "abstract": "We study federated learning (FL), which enables mobile devices to utilize their local datasets to collaboratively train a global model with the help of a central server, while keeping data localized. At each iteration, the server broadcasts the current global model to the devices for local training, and aggregates the local model updates from the devices to update the global model. Previous work on the communication efficiency of FL has mainly focused on the aggregation of model updates from the devices, assuming perfect broadcasting of the global model. In this paper, we instead consider broadcasting a compressed version of the global model. This is to further reduce the communication cost of FL, which can be particularly limited when the global model is to be transmitted over a wireless medium. We introduce a lossy FL (LFL) algorithm, in which both the global model and the local model updates are quantized before being transmitted. We analyze the convergence behavior of the proposed LFL algorithm assuming the availability of accurate local model updates at the server. 
Numerical experiments show that the proposed LFL scheme, which quantizes the global model update (with respect to the global model estimate at the devices) rather than the global model itself, significantly outperforms other existing schemes studying quantization of the global model at the PS-to-device direction. Also, the performance loss of the proposed scheme is marginal compared to the fully lossless approach, where the PS and the devices transmit their messages entirely without any quantization.", + "title": "Federated Learning With Quantized Global Model Updates", + "authors": [ + "Mohammad Mohammadi Amiri", + "Deniz Gunduz", + "Sanjeev Kulkarni", + "H. Vincent Poor" + ], + "emails": [ + "imperial.ac.uk", + "~H._Vincent_Poor1", + "~Sanjeev_Kulkarni1", + "~Mohammad_Mohammadi_Amiri1" + ], + "rank": 1585 + }, + { + "url": "https://openreview.net/forum?id=PAsd7_vP4_", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5, + 7 + ], + "rating": "5.23", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "Controlling the movements of highly articulated agents and robots has been a long-standing challenge to model-free deep reinforcement learning. In this paper, we propose a simple, yet general, framework for improving the performance of policy gradient algorithms by discretizing the continuous action space. Instead of using a fixed set of predetermined atomic actions, we exploit particle filtering to adaptively discretize actions during training and track the posterior policy represented as a mixture distribution. The resulting policy can replace the original continuous policy of any given policy gradient algorithm without changing its underlying model architecture. We demonstrate the applicability of our approach to state-of-the-art on-policy and off-policy baselines in challenging control tasks. 
Baselines using our particle-based policies achieve better final performance and speed of convergence as compared to corresponding continuous implementations and implementations that rely on fixed discretization schemes. ", + "title": "Adaptive Discretization for Continuous Control using Particle Filtering Policy Network", + "authors": [ + "Pei Xu", + "Ioannis Karamouzas" + ], + "emails": [ + "~Pei_Xu1", + "~Ioannis_Karamouzas1" + ], + "rank": 1586 + }, + { + "url": "https://openreview.net/forum?id=uQnJqzkhrmj", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 5 + ], + "rating": "5.23", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "Circuit routing has been a historically challenging problem in designing electronic systems such as very large-scale integration (VLSI) and printed circuit boards (PCBs). The main challenge is that connecting a large number of electronic components under specific design rules and constraints involves a very large search space, which is proved to be NP-complete. \nEarly solutions are typically designed with hard-coded heuristics, which suffer from problems of non-optimum solutions and lack of flexibility for new design needs. Although a few learning-based methods have been proposed recently, their methods are cumbersome and hard to extend to large-scale applications. In this work, we propose a new algorithm for circuit routing, named as Ranking Cost (RC), which innovatively combines search-based methods (i.e., A* algorithm) and learning-based methods (i.e., Evolution Strategies) to form an efficient and trainable router under a proper parameterization. Different from two-stage routing methods ( i.e., first global routing and then detailed routing), our method involves a one-stage procedure that directly optimizes the global objective function, thus it can be easy to adapt to new routing rules and constraints. 
In our method, we introduce a new set of variables called cost maps, which can help the A* router to find out proper paths to achieve the global object. We also train a ranking parameter, which can produce the ranking order and further improve the performance of our method. Our algorithm is trained in an end-to-end manner and does not use any artificial data or human demonstration. In the experiments, we compare our method with the sequential A* algorithm and a canonical reinforcement learning approach, and results show that our method outperforms these baselines with higher connectivity rates and better scalability. Our ablation study shows that our trained cost maps can capture the global information and guide the routing result to approach global optimum.", + "title": "Ranking Cost: One-Stage Circuit Routing by Directly Optimizing Global Objective Function", + "authors": [ + "Shiyu Huang", + "Bin Wang", + "Dong Li", + "Jianye Hao", + "Jun Zhu", + "Ting Chen" + ], + "emails": [ + "~Jun_Zhu2", + "~Bin_Wang12", + "tsinghua.edu.cn", + "~Shiyu_Huang2", + "huawei.com" + ], + "rank": 1587 + }, + { + "url": "https://openreview.net/forum?id=Wga_hrCa3P3", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 6, + 5, + 6 + ], + "rating": "5.23", + "confidences": [ + 3, + 3, + 4, + 3 + ], + "abstract": "Recently, sequence-to-sequence (seq2seq) models with the Transformer architecture have achieved remarkable performance on various conditional text generation tasks, such as machine translation. However, most of them are trained with teacher forcing with the ground truth label given at each time step, without being exposed to incorrectly generated tokens during training, which hurts its generalization to unseen inputs, that is known as the \"exposure bias\" problem. 
In this work, we propose to solve the conditional text generation problem by contrasting positive pairs with negative pairs, such that the model is exposed to various valid or incorrect perturbations of the inputs, for improved generalization. However, training the model with na\u00efve contrastive learning framework using random non-target sequences as negative examples is suboptimal, since they are easily distinguishable from the correct output, especially so with models pretrained with large text corpora. Also, generating positive examples requires domain-specific augmentation heuristics which may not generalize over diverse domains. To tackle this problem, we propose a principled method to generate positive and negative samples for contrastive learning of seq2seq models. Specifically, we generate negative examples by adding small perturbations to the input sequence to minimize its conditional likelihood, and positive examples by adding large perturbations while enforcing it to have a high conditional likelihood. Such `\"hard'' positive and negative pairs generated using our method guides the model to better distinguish correct outputs from incorrect ones. We empirically show that our proposed method significantly improves the generalization of the seq2seq on three text generation tasks --- machine translation, text summarization, and question generation.", + "title": "Contrastive Learning with Adversarial Perturbations for Conditional Text Generation", + "authors": [ + "Seanie Lee", + "Dong Bok Lee", + "Sung Ju Hwang" + ], + "emails": [ + "~Dong_Bok_Lee1", + "~Seanie_Lee1", + "~Sung_Ju_Hwang1" + ], + "rank": 1588 + }, + { + "url": "https://openreview.net/forum?id=YEv8xafoAQ", + "ratings": [ + 4, + 6, + 6 + ], + "rating": "5.23", + "confidences": [ + 5, + 4, + 4 + ], + "abstract": "In recent years, text-guided image manipulation has gained increasing attention in the image generation research field. 
Recent works have proposed to deal with a simplified setting where the input image only has a single object and the text modification is acquired by swapping image captions or labels. In this paper, we study a setting that allows users to edit an image with multiple objects using complex text instructions. In this image generation task, the inputs are a reference image and an instruction in natural language that describes desired modifications to the input image. We propose a GAN-based method to tackle this problem. The key idea is to treat text as neural operators to locally modify the image feature. We show that the proposed model performs favorably against recent baselines on three public datasets.", + "title": "Text as Neural Operator: Image Manipulation by Text Instruction", + "authors": [ + "Tianhao Zhang", + "Hung-Yu Tseng", + "Lu Jiang", + "Honglak Lee", + "Irfan Essa", + "Weilong Yang" + ], + "emails": [ + "~Tianhao_Zhang2", + "~Lu_Jiang1", + "~Irfan_Essa1", + "~Honglak_Lee2", + "~Hung-Yu_Tseng2", + "~Weilong_Yang1" + ], + "rank": 1589 + }, + { + "url": "https://openreview.net/forum?id=rSwTMomgCz", + "decision": "Reject", + "ratings": [ + 5, + 4, + 6, + 7 + ], + "rating": "5.23", + "confidences": [ + 4, + 4, + 3, + 2 + ], + "abstract": "The goal of meta-reinforcement learning (meta-RL) is to build agents that can quickly learn new tasks by leveraging prior experience on related tasks. Learning a new task often requires both exploring to gather task-relevant information and exploiting this information to solve the task. In principle, optimal exploration and exploitation can be learned end-to-end by simply maximizing task performance. However, such meta-RL approaches struggle with local optima due to a chicken-and-egg problem: learning to explore requires good exploitation to gauge the exploration's utility, but learning to exploit requires information gathered via exploration. 
Optimizing separate objectives for exploration and exploitation can avoid this problem, but prior meta-RL exploration objectives yield suboptimal policies that gather information irrelevant to the task. We alleviate both concerns by constructing an exploitation objective that automatically identifies task-relevant information and an exploration objective to recover only this information. This avoids local optima in end-to-end training, without sacrificing optimal exploration. Empirically, DREAM substantially outperforms existing approaches on complex meta-RL problems, such as sparse-reward 3D visual navigation.", + "title": "Decoupling Exploration and Exploitation for Meta-Reinforcement Learning without Sacrifices", + "authors": [ + "Evan Zheran Liu", + "Aditi Raghunathan", + "Percy Liang", + "Chelsea Finn" + ], + "emails": [ + "~Chelsea_Finn1", + "~Percy_Liang1", + "~Evan_Zheran_Liu1", + "~Aditi_Raghunathan1" + ], + "rank": 1590 + }, + { + "url": "https://openreview.net/forum?id=5mhViEOQxaV", + "decision": "Reject", + "ratings": [ + 5, + 7, + 4 + ], + "rating": "5.23", + "confidences": [ + 4, + 4, + 5 + ], + "abstract": "A multi-task learning (MTL) system aims at solving multiple related tasks at the same time. With a fixed model capacity, the tasks would be conflicted with each other, and the system usually has to make a trade-off among learning all of them together. Multiple models with different preferences over tasks have to be trained and stored for many real-world applications where the trade-off has to be made online. This work proposes a novel controllable Pareto multi-task learning framework, to enable the system to make real-time trade-off switch among different tasks with a single model. To be specific, we formulate the MTL as a preference-conditioned multiobjective optimization problem, for which there is a parametric mapping from the preferences to the Pareto stationary solutions. 
A single hypernetwork-based multi-task neural network is built to learn all tasks with different trade-off preferences among them, where the hypernetwork generates the model parameters conditioned on the preference. At the inference time, MTL practitioners can easily control the model performance based on different trade-off preferences in real-time. Experiments on different applications demonstrate that the proposed model is efficient for solving various multi-task learning problems. ", + "title": "Controllable Pareto Multi-Task Learning", + "authors": [ + "Xi Lin", + "Zhiyuan YANG", + "Qingfu Zhang", + "Sam Kwong" + ], + "emails": [ + "~Zhiyuan_YANG1", + "~Sam_Kwong1", + "cityu.edu.hk", + "~Xi_Lin2" + ], + "rank": 1591 + }, + { + "url": "https://openreview.net/forum?id=kcqSDWySoy", + "decision": "Reject", + "ratings": [ + 7, + 5, + 4 + ], + "rating": "5.23", + "confidences": [ + 4, + 4, + 5 + ], + "abstract": "Approximating the numerical solutions of partial differential equations (PDEs) using neural networks is a promising application of deep learning. The smooth architecture of a fully connected neural network is appropriate for finding the solutions of PDEs; the corresponding loss function can also be intuitively designed and guarantees the convergence for various kinds of PDEs. However, the rate of convergence has been considered as a weakness of this approach. This paper introduces a novel loss function for the training of neural networks to find the solutions of PDEs, making the training substantially efficient. Inspired by the recent studies that incorporate derivative information for the training of neural networks, we develop a loss function that guides a neural network to reduce the error in the corresponding Sobolev space. Surprisingly, a simple modification of the loss function can make the training process similar to Sobolev Training although solving PDEs with neural networks is not a fully supervised learning task. 
We provide several theoretical justifications for such an approach for the viscous Burgers equation and the kinetic Fokker--Planck equation. We also present several simulation results, which show that compared with the traditional L2 loss function, the proposed loss function guides the neural network to a significantly faster convergence. Moreover, we provide the empirical evidence that shows that the proposed loss function, together with the iterative sampling techniques, performs better in solving high dimensional PDEs.", + "title": "Sobolev Training for the Neural Network Solutions of PDEs", + "authors": [ + "Hwijae Son", + "Jin Woo Jang", + "Woo Jin Han", + "Hyung Ju Hwang" + ], + "emails": [ + "postech.ac.kr", + "~Hwijae_Son1", + "~Hyung_Ju_Hwang1", + "iam.uni-bonn.de" + ], + "rank": 1592 + }, + { + "url": "https://openreview.net/forum?id=iOVomQW073", + "ratings": [ + 5, + 5, + 5, + 6 + ], + "rating": "5.23", + "confidences": [ + 2, + 4, + 4, + 3 + ], + "abstract": "Top-k predictions are used in many real-world applications such as machine learning as a service, recommender systems, and web searches. \u21130-norm adversarial perturbation characterizes an attack that arbitrarily modifies some features of an input such that a classifier makes an incorrect prediction for the perturbed input. \u21130-norm adversarial perturbation is easy to interpret and can be implemented in the physical world. Therefore, certifying robustness of top-k predictions against \u21130-norm adversarial perturbation is important. However, existing studies either focused on certifying \u21130-norm robustness of top-1 predictions or \u21132-norm robustness of top-k predictions. In this work, we aim to bridge the gap. Our approach is based on randomized smoothing, which builds a provably robust classifier from an arbitrary classifier via randomizing an input. Our major theoretical contribution is an almost tight \u21130-norm certified robustness guarantee for top-k predictions. 
We empirically evaluate our method on CIFAR10 and ImageNet. For instance, our method can build a classifier that achieves a certified top-3 accuracy of 69.2\\% on ImageNet when an attacker can arbitrarily perturb 5 pixels of a testing image. We will publish our code upon paper acceptance.", + "title": "Almost Tight L0-norm Certified Robustness of Top-k Predictions against Adversarial Perturbations", + "authors": [ + "Jinyuan Jia", + "Binghui Wang", + "Xiaoyu Cao", + "Hongbin Liu", + "Neil Zhenqiang Gong" + ], + "emails": [ + "~Xiaoyu_Cao1", + "duke.edu", + "~Jinyuan_Jia2", + "~Neil_Zhenqiang_Gong1", + "~Binghui_Wang2" + ], + "rank": 1593 + }, + { + "url": "https://openreview.net/forum?id=SOVSJZ9PTO7", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 5 + ], + "rating": "5.22", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "Knowledge graphs (KGs) contain rich information about world knowledge, entities, and relations. Thus, they can be great supplements to existing pre-trained language models. However, it remains a challenge to efficiently integrate information from KG into language modeling. And the understanding of a knowledge graph requires related context. We propose a novel joint pre-training framework, JAKET, to model both the knowledge graph and language. The knowledge module and language module provide essential information to mutually assist each other: the knowledge module produces embeddings for entities in text while the language module generates context-aware initial embeddings for entities and relations in the graph. Our design enables the pre-trained model to easily adapt to unseen knowledge graphs in new domains. 
Experimental results on several knowledge-aware NLP tasks show that our proposed framework achieves superior performance by effectively leveraging knowledge in language understanding.", + "title": "JAKET: Joint Pre-training of Knowledge Graph and Language Understanding", + "authors": [ + "Donghan Yu", + "Chenguang Zhu", + "Yiming Yang", + "Michael Zeng" + ], + "emails": [ + "~Michael_Zeng1", + "~Chenguang_Zhu1", + "~Donghan_Yu2", + "~Yiming_Yang1" + ], + "rank": 1594 + }, + { + "url": "https://openreview.net/forum?id=qf6Nmm-_6Z", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 5 + ], + "rating": "5.22", + "confidences": [ + 5, + 4, + 4, + 5 + ], + "abstract": "In this paper, we study how to simultaneously learn two highly correlated tasks of graph analysis, i.e., community detection and node representation learning. We propose an efficient generative model called VECoDeR for jointly learning Variational Embeddings for COmmunity DEtection and node Representation. VECoDeR assumes that every node can be a member of one or more communities. The node embeddings are learned in such a way that connected nodes are not only ``closer\" to each other but also share similar community assignments. A joint learning framework leverages community-aware node embeddings for better community detection. We demonstrate on several graph datasets that VECoDeR effectively outperforms many competitive baselines on all three tasks i.e. node classification, overlapping community detection and non-overlapping community detection. 
We also show that VECoDeR is computationally efficient and has quite robust performance with varying hyperparameters.", + "title": "VECoDeR - Variational Embeddings for Community Detection and Node Representation", + "authors": [ + "Rayyan Ahmad Khan", + "Muhammad Umer Anwaar", + "Omran Kaddah", + "Martin Kleinsteuber" + ], + "emails": [ + "~Omran_Kaddah1", + "~Martin_Kleinsteuber1", + "~Muhammad_Umer_Anwaar1", + "~Rayyan_Ahmad_Khan1" + ], + "rank": 1595 + }, + { + "url": "https://openreview.net/forum?id=Fn5wiAq2SR", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5 + ], + "rating": "5.22", + "confidences": [ + 4, + 2, + 3 + ], + "abstract": "To protect the security of machine learning models against adversarial examples, adversarial training becomes the most popular and powerful strategy against various adversarial attacks by injecting adversarial examples into training data. However, it is time-consuming and requires high computation complexity to generate suitable adversarial examples for ensuring the robustness of models, which impedes the spread and application of adversarial training. In this work, we reformulate adversarial training as a combination of stationary distribution exploring, sampling, and training. Each updating of parameters of DNN is based on several transitions from the data samples as the initial states in a Hamiltonian system. Inspired by our new paradigm, we design a new generative method for adversarial training by using Contrastive Divergence (ATCD), which approaches the equilibrium distribution of adversarial examples with only few iterations by building from small modifications of the standard Contrastive Divergence (CD). 
Our adversarial training algorithm achieves much higher robustness than any other state-of-the-art adversarial training acceleration method on the ImageNet, CIFAR-10, and MNIST datasets and reaches a balance between performance and efficiency.", + "title": "Adversarial Training using Contrastive Divergence", + "authors": [ + "Hongjun Wang", + "Guanbin Li", + "Liang Lin" + ], + "emails": [ + "~Liang_Lin1", + "~Guanbin_Li2", + "~Hongjun_Wang2" + ], + "rank": 1596 + }, + { + "url": "https://openreview.net/forum?id=IU8QxEiG4hR", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 5 + ], + "rating": "5.22", + "confidences": [ + 5, + 5, + 4, + 4 + ], + "abstract": "Accurate layout estimation is crucial for planning and navigation, for robotics applications such as self driving. In this paper, we introduce stereo bird's eye view network SBEVNet, a novel supervised end-to-end framework for estimation of bird's eye view layout from a pair of stereo images. Although our network reuses the building blocks from the state-of-the-art deep learning networks for disparity estimation, we show that accurate depth estimation is neither sufficient nor necessary. Instead, the learning of a good internal bird's eye view feature representation is essential for layout estimation. Specifically, we first generate a disparity feature volume using the features of the stereo images and then project it to the bird's eye view coordinates. This gives us coarse grained scene structural information. We also apply inverse perspective mapping (IPM) to map the input images and their features to the bird's eye view. This gives us fine grained texture information. The concatenated IPM features with the projected feature volume creates a rich bird's eye view representation which is capable of spatial reasoning. We use this representation to estimate the BEV semantic map. 
Additionally, we show that using the IPM features as a supervisory signal for stereo features can give an improvement in performance. We demonstrate our approach on two datasets: KITTI dataset and synthetically generated dataset using the CARLA simulator. For both of the datasets, we establish state-of-the-art performance beyond other baselines.", + "title": "SBEVNet: End-to-End Deep Stereo Layout Estimation", + "authors": [ + "Divam Gupta", + "Wei Pu", + "Trenton Tabor", + "Jeff Schneider" + ], + "emails": [ + "~Trenton_Tabor1", + "~Divam_Gupta1", + "nrec.ri.cmu.edu", + "~Jeff_Schneider1" + ], + "rank": 1597 + }, + { + "url": "https://openreview.net/forum?id=49V11oUejQ", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 6 + ], + "rating": "5.22", + "confidences": [ + 4, + 5, + 5, + 4 + ], + "abstract": "Adversarial training is so far the most effective strategy in defending against adversarial examples. However, it suffers from high computational cost due to the iterative adversarial attacks in each training step. Recent studies show that it is possible to achieve Fast Adversarial Training by performing a single-step attack with random initialization. Yet, it remains a mystery why random initialization helps. Besides, such an approach still lags behind state-of-the-art adversarial training algorithms on both stability and model robustness. In this work, we develop a new understanding towards Fast Adversarial Training, by viewing random initialization as performing randomized smoothing for better optimization of the inner maximization problem. From this perspective, we show that the smoothing effect by random initialization is not sufficient under the adversarial perturbation constraint. A new initialization strategy, \\emph{backward smoothing}, is proposed to address this issue and significantly improves both stability and model robustness over single-step robust training methods. 
Experiments on multiple benchmarks demonstrate that our method achieves similar model robustness as the original TRADES method, while using much less training time (~3x improvement with the same training schedule). ", + "title": "Efficient Robust Training via Backward Smoothing", + "authors": [ + "Jinghui Chen", + "Yu Cheng", + "Zhe Gan", + "Quanquan Gu", + "Jingjing Liu" + ], + "emails": [ + "~Jingjing_Liu2", + "~Yu_Cheng1", + "~Zhe_Gan1", + "~Quanquan_Gu1", + "~Jinghui_Chen1" + ], + "rank": 1598 + }, + { + "url": "https://openreview.net/forum?id=WW8VEE7gjx", + "decision": "Reject", + "ratings": [ + 4, + 7, + 5 + ], + "rating": "5.22", + "confidences": [ + 4, + 3, + 2 + ], + "abstract": "We reformulate unsupervised dimension reduction problem (UDR) in the language of tempered distributions, i.e. as a problem of approximating an empirical probability density function pemp(x) by another tempered distribution q(x) whose support is in a k-dimensional subspace. Thus, our problem is reduced to the minimization of the distance between q and pemp, D(q,pemp), over a pertinent set of generalized functions.\n\nThis infinite-dimensional formulation allows to establish a connection with another classical problem of data science --- the sufficient dimension reduction problem (SDR). Thus, an algorithm for the first problem induces an algorithm for the second and vice versa. In order to reduce an optimization problem over distributions to an optimization problem over ordinary functions we introduce a nonnegative penalty function R(f) that ``forces'' the support of f to be k-dimensional.\nThen we present an algorithm for minimization of I(f)+\u03bbR(f), based on the idea of two-step iterative computation, briefly described as a) an adaptation to real data and to fake data sampled around a k-dimensional subspace found at a previous iteration, b) calculation of a new k-dimensional subspace. 
We demonstrate the method on 4 examples (3 UDR and 1 SDR) using synthetic data and standard datasets.", + "title": "Dimension reduction as an optimization problem over a set of generalized functions", + "authors": [ + "Rustem Takhanov" + ], + "emails": [ + "~Rustem_Takhanov1" + ], + "rank": 1599 + }, + { + "url": "https://openreview.net/forum?id=4pN0NjwSoPR", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4, + 6 + ], + "rating": "5.22", + "confidences": [ + 4, + 5, + 5, + 4 + ], + "abstract": "Model quantization is to discretize weights and activations of a deep neural network (DNN).\nUnlike previous methods that manually defined the quantization hyperparameters such as precision (\\ie bitwidth), dynamic range (\\ie minimum and maximum discrete values) and stepsize (\\ie interval between discrete values),\nthis work proposes a novel approach to differentiably learn all of them, named Differentiable Dynamic Quantization (DDQ), which possesses several appealing benefits. (1) Unlike previous works that applied the rounding operation to discretize values, DDQ provides a unified perspective by formulating discretization as a matrix-vector product, where different values of the matrix and vector represent different quantization methods such as mixed precision and soft quantization, and their values can be learned differentiably from training data, making different hidden layers in a DNN used different quantization methods. \n(2) DDQ is hardware-friendly, where all variables can be computed by using low-precision matrix-vector multiplication, making it capable in wide spectrum of hardwares.\n(3) The matrix variable is carefully reparameterized to reduce its number of parameters from O(2^{b^2}) to O(\\log2^b), where b is the bit width.\nExtensive experiments show that DDQ outperforms prior arts on various advanced networks and benchmarks. 
For instance, compared to the full-precision models, MobileNetv2 trained with DDQ achieves comparable top1 accuracy on ImageNet (71.7% vs 71.9%), while ResNet18 trained with DDQ increases accuracy by 0.5%.\nThese results relatively improve recent state-of-the-art quantization methods by 70% and 140% compared to the full-precision models.\n", + "title": "Differentiable Dynamic Quantization with Mixed Precision and Adaptive Resolution", + "authors": [ + "Zhaoyang Zhang", + "Wenqi Shao", + "Jinwei Gu", + "Xiaogang Wang", + "Ping Luo" + ], + "emails": [ + "~Xiaogang_Wang2", + "~Jinwei_Gu1", + "~Ping_Luo2", + "~Zhaoyang_Zhang1", + "~Wenqi_Shao2" + ], + "rank": 1600 + }, + { + "url": "https://openreview.net/forum?id=C_p3TDhOXW_", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5 + ], + "rating": "5.22", + "confidences": [ + 2, + 3, + 4 + ], + "abstract": "Active inference may be defined as Bayesian modeling of a brain with a biologically plausible model of the agent. Its primary idea relies on the free energy principle and the prior preference of the agent. An agent will choose an action that leads to its prior preference for a future observation. In this paper, we claim that active inference can be interpreted using reinforcement learning (RL) algorithms and find a theoretical connection between them. We extend the concept of expected free energy (EFE), which is a core quantity in active inference, and claim that EFE can be treated as a negative value function. Motivated by the concept of prior preference and a theoretical connection, we propose a simple but novel method for learning a prior preference from experts. This illustrates that the problem with RL can be approached with a new perspective of active inference. 
Experimental results of prior preference learning show the possibility of active inference with EFE-based rewards and its application to an inverse RL problem.", + "title": "Prior Preference Learning From Experts: Designing A Reward with Active Inference", + "authors": [ + "Jin Young Shin", + "Cheolhyeong Kim", + "Hyung Ju Hwang" + ], + "emails": [ + "~Jin_Young_Shin1", + "~Cheolhyeong_Kim1", + "~Hyung_Ju_Hwang1" + ], + "rank": 1601 + }, + { + "url": "https://openreview.net/forum?id=OyDjznG-x2e", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5, + 5, + 6 + ], + "rating": "5.22", + "confidences": [ + 4, + 3, + 4, + 4, + 3 + ], + "abstract": "Error correction codes are an integral part of communication applications and boost the reliability of transmission. The optimal decoding of transmitted codewords is the maximum likelihood rule, which is NP-hard. For practical realizations, suboptimal decoding algorithms are employed; however, the lack of theoretical insights currently impedes the exploitation of the full potential of these algorithms. One key insight is the choice of permutation in permutation decoding. We present a data-driven framework for permutation selection combining domain knowledge with machine learning concepts such as node embedding and self-attention. Significant and consistent improvements in the bit error rate are shown for the simulated Bose Chaudhuri Hocquenghem (BCH) code as compared to the baseline decoders. 
To the best of our knowledge, this work is the first to leverage the benefits of self-attention networks in physical layer communication systems.\n", + "title": "Graph Permutation Selection for Decoding of Error Correction Codes using Self-Attention", + "authors": [ + "Nir Raviv", + "Avi Caciularu", + "Tomer Raviv", + "Jacob Goldberger", + "Yair Be'ery" + ], + "emails": [ + "~Avi_Caciularu1", + "gmail.com", + "post.tau.ac.il", + "~Jacob_Goldberger1" + ], + "rank": 1602 + }, + { + "url": "https://openreview.net/forum?id=Gc4MQq-JIgj", + "decision": "Reject", + "ratings": [ + 7, + 5, + 4 + ], + "rating": "5.22", + "confidences": [ + 3, + 2, + 4 + ], + "abstract": "Practical reinforcement learning problems are often formulated as constrained Markov decision process (CMDP) problems, in which the agent has to maximize the expected return while satisfying a set of prescribed safety constraints. In this study, we consider a situation in which the agent has access to the generative model which provides us with a next state sample for any given state-action pair, and propose a model to solve a CMDP problem by decomposing the CMDP into a pair of MDPs; \\textit{reconnaissance} MDP (R-MDP) and \\textit{planning} MDP (P-MDP). In R-MDP, we train threat function, the Q-function analogue of danger that can determine whether a given state-action pair is safe or not. In P-MDP, we train a reward-seeking policy while using a fixed threat function to determine the safeness of each action. With the help of generative model, we can efficiently train the threat function by preferentially sampling rare dangerous events. Once the threat function for a baseline policy is computed, we can solve other CMDP problems with different reward and different danger-constraint without the need to re-train the model. We also present an efficient approximation method for the threat function that can greatly reduce the difficulty of solving R-MDP. 
We will demonstrate the efficacy of our method over classical approaches in benchmark dataset and complex collision-free navigation tasks.", + "title": "Reconnaissance for reinforcement learning with safety constraints", + "authors": [ + "Shin-ichi Maeda", + "Hayato Watahiki", + "Yi Ouyang", + "Shintarou Okada", + "Masanori Koyama" + ], + "emails": [ + "preferred.jp", + "~Hayato_Watahiki1", + "~Masanori_Koyama1", + "~Yi_Ouyang1", + "~Shin-ichi_Maeda2" + ], + "rank": 1603 + }, + { + "url": "https://openreview.net/forum?id=sbyjwhxxT8K", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 5 + ], + "rating": "5.22", + "confidences": [ + 4, + 5, + 4, + 5 + ], + "abstract": "Graph neural networks (GNNs) have attracted increasing interests. With broad deployments of GNNs in real-world applications, there is an urgent need for understanding the robustness of GNNs under adversarial attacks, especially in realistic setups. In this work, we study the problem of attacking GNNs in a restricted near-black-box setup, by perturbing the features of a small set of nodes, with no access to model parameters and model predictions. Our formal analysis draws a connection between this type of attacks and an influence maximization problem on the graph. This connection not only enhances our understanding on the problem of adversarial attack on GNNs, but also allows us to propose a group of effective near-black-box attack strategies. Our experiments verify that the proposed strategies significantly degrade the performance of three popular GNN models and outperform baseline adversarial attack strategies. 
", + "title": "Near-Black-Box Adversarial Attacks on Graph Neural Networks as An Influence Maximization Problem", + "authors": [ + "Jiaqi Ma", + "Junwei Deng", + "Qiaozhu Mei" + ], + "emails": [ + "~Jiaqi_Ma1", + "umich.edu", + "~Qiaozhu_Mei1" + ], + "rank": 1604 + }, + { + "url": "https://openreview.net/forum?id=TEtO5qiBYvE", + "decision": "Reject", + "ratings": [ + 4, + 5, + 6 + ], + "rating": "5.22", + "confidences": [ + 2, + 3, + 4 + ], + "abstract": "Existing reasoning tasks often follow the setting of \u2018\u2019end-to-end reasoning'', which has an important assumption that the input contents can be always accessed while reasoning. However, human beings frequently adopt another reasoning setting in daily life, referred to \u2018\u2019reasoning after memorizing''. Concretely, human beings have the ability to unconsciously memorize their experiences within limited memory capacity, from which they can recall and respond to subsequent tasks. In this setting, the input contents are no longer available during reasoning, thus we need to compress and memorize the input stream in one pass, trying to answer general queries that are unseen before. Memory augmented neural networks introduce a write-read memory to perform such human-like memorization and reasoning, but they continually update the memory from current information and inevitably forget the early contents, failing to answer the queries relevant to early information. In this paper, we propose the Continual Memory (CM) to explore this ability of reasoning after long-term memorization. To alleviate the gradual forgetting of early information, we develop self-supervised memorization training with item-level and sequence-level objectives. 
We demonstrate several interesting characteristics of our continual memory via synthetic data, and evaluate its performance by several downstream tasks, including long-term text QA, long-term video QA and recommendation with long sequences.", + "title": "Continual Memory: Can We Reason After Long-Term Memorization?", + "authors": [ + "Zhu Zhang", + "Chang Zhou", + "Zhou Zhao", + "Zhijie Lin", + "Jingren Zhou", + "Hongxia Yang" + ], + "emails": [ + "~Jingren_Zhou1", + "~Zhu_Zhang3", + "~Zhou_Zhao2", + "~Zhijie_Lin1", + "~Hongxia_Yang2", + "~Chang_Zhou2" + ], + "rank": 1605 + }, + { + "url": "https://openreview.net/forum?id=-u4j4dHeWQi", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 4 + ], + "rating": "5.21", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "In reinforcement learning, a map with states and transitions built based on historical trajectories is often helpful in exploration and exploitation. Even so, learning and planning on such a map within a sparse environment remains a challenge. As a step towards this goal, we propose Graph Structured Reinforcement Learning (GSRL), which utilizes historical trajectories to slowly adjust exploration directions and learn related experiences while rapidly updating the value function estimation. GSRL constructs a dynamic graph on top of state transitions in the replay buffer based on historical trajectories, and develops an attention strategy on the map to select an appropriate goal direction, which decomposes the task of reaching a distant goal state into a sequence of easier tasks. We also leverage graph structure to sample related trajectories for efficient value learning. Results demonstrate that GSRL can outperform the state-of-the-art algorithms in terms of sample efficiency on benchmarks with sparse reward functions. 
", + "title": "Explore with Dynamic Map: Graph Structured Reinforcement Learning", + "authors": [ + "Jiarui Jin", + "Sijin Zhou", + "Weinan Zhang", + "Rasool Fakoor", + "David Wipf", + "Tong He", + "Yong Yu", + "Zheng Zhang", + "Alex Smola" + ], + "emails": [ + "~Zheng_Zhang1", + "sjtu.edu.cn", + "~Alex_Smola1", + "~Rasool_Fakoor1", + "~Jiarui_Jin1", + "~Yong_Yu1", + "~David_Wipf1", + "~Tong_He5", + "~Weinan_Zhang1" + ], + "rank": 1606 + }, + { + "url": "https://openreview.net/forum?id=FUdBF49WRV1", + "decision": "Reject", + "ratings": [ + 4, + 5, + 7, + 5 + ], + "rating": "5.21", + "confidences": [ + 5, + 2, + 4, + 3 + ], + "abstract": "In order to overcome the expressive limitations of graph neural networks (GNNs), we propose the first method that exploits vector flows over graphs to develop globally consistent directional and asymmetric aggregation functions. \nWe show that our directional graph networks (DGNs) generalize convolutional neural networks (CNNs) when applied on a grid. Whereas recent theoretical works focus on understanding local neighbourhoods, local structures and local isomorphism with no global information flow, our novel theoretical framework allows directional convolutional kernels in any graph.\nFirst, by defining a vector field in the graph, we develop a method of applying directional derivatives and smoothing by projecting node-specific messages into the field. \nThen we propose the use of the Laplacian eigenvectors as such vector field, and we show that the method generalizes CNNs on an n-dimensional grid, and is provably more discriminative than standard GNNs regarding the Weisfeiler-Lehman 1-WL test.\nFinally, we bring the power of CNN data augmentation to graphs by providing a means of doing reflection, rotation and distortion on the underlying directional field. We evaluate our method on different standard benchmarks and see a relative error reduction of 8% on the CIFAR10 graph dataset and 11% to 32% on the molecular ZINC dataset. 
An important outcome of this work is that it enables to translate any physical or biological problems with intrinsic directional axes into a graph network formalism with an embedded directional field. ", + "title": "Directional graph networks", + "authors": [ + "Dominique Beaini", + "Saro Passaro", + "Vincent Letourneau", + "William L. Hamilton", + "Gabriele Corso", + "Pietro Li\u00f2" + ], + "emails": [ + "cam.ac.uk", + "invivoai.com", + "~Pietro_Li\u00f21", + "~Dominique_Beaini1", + "~William_L._Hamilton1" + ], + "rank": 1607 + }, + { + "url": "https://openreview.net/forum?id=tkra4vFiFq", + "decision": "Reject", + "ratings": [ + 7, + 6, + 5, + 3 + ], + "rating": "5.21", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Machine learning models based on Deep Neural Networks (DNNs) are increasingly being deployed in a wide range of applications ranging from self-driving cars to Covid-19 diagnostics. The computational power necessary to learn a DNN is non-trivial. So, as a result, cloud environments with dedicated hardware support emerged as important infrastructure. However, outsourcing computation to the cloud raises security, privacy, and integrity challenges. To address these challenges, previous works tried to leverage homomorphic encryption, secure multi-party computation, and trusted execution environments (TEE). Yet, none of these approaches can scale up to support realistic DNN model training workloads with deep architectures and millions of training examples without sustaining a significant performance hit. In this work, we focus on the setting where the integrity of the outsourced Deep Learning (DL) model training is ensured by TEE. We choose the TEE based approach because it has been shown to be more efficient compared to the pure cryptographic solutions, and the availability of TEEs on cloud environments. 
To mitigate the loss in performance, we combine random verification of selected computation steps with careful adjustments of DNN used for training. Our experimental results show that the proposed approach may achieve 2X to 20X performance improvement compared to the pure TEE based solution while guaranteeing the integrity of the computation with high probability (e.g., 0.999) against the state-of-the-art DNN backdoor attacks.", + "title": "GINN: Fast GPU-TEE Based Integrity for Neural Network Training", + "authors": [ + "Aref Asvadishirehjini", + "Murat Kantarcioglu", + "Bradley A. Malin" + ], + "emails": [ + "~Murat_Kantarcioglu1", + "utdallas.edu", + "~Bradley_A._Malin1" + ], + "rank": 1608 + }, + { + "url": "https://openreview.net/forum?id=3LujMJM9EMp", + "decision": "Reject", + "ratings": [ + 7, + 4, + 6, + 5 + ], + "rating": "5.21", + "confidences": [ + 2, + 4, + 3, + 5 + ], + "abstract": "Estimating mutual information between continuous random variables is often intractable and extremely challenging for high-dimensional data. Recent progress has leveraged neural networks to optimize variational lower bounds on mutual information. Although showing promise for this difficult problem, the variational methods have been theoretically and empirically proven to have serious statistical limitations: 1) many methods struggle to produce accurate estimates when the underlying mutual information is either low or high; 2) the resulting estimators may suffer from high variance. Our approach is based on training a classifier that provides the probability that a data sample pair is drawn from the joint distribution rather than from the product of its marginal distributions. Moreover, we establish a direct connection between mutual information and the average log odds estimate produced by the classifier on a test set, leading to a simple and accurate estimator of mutual information. 
We show theoretically that our method and other variational approaches are equivalent when they achieve their optimum, while our method sidesteps the variational bound. Empirical results demonstrate high accuracy of our approach and the advantages of our estimator in the context of representation learning.\n", + "title": "DEMI: Discriminative Estimator of Mutual Information ", + "authors": [ + "Ruizhi Liao", + "Daniel Moyer", + "Polina Golland", + "William M Wells" + ], + "emails": [ + "~Ruizhi_Liao3", + "~William_M_Wells1", + "~Polina_Golland1", + "~Daniel_Moyer3" + ], + "rank": 1609 + }, + { + "url": "https://openreview.net/forum?id=R6tNszN_QfA", + "decision": "Reject", + "ratings": [ + 4, + 6, + 4, + 7 + ], + "rating": "5.21", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "We are interested in the design of generative networks. The training of these mathematical structures is mostly performed with the help of adversarial (min-max) optimization problems. We propose a simple methodology for constructing such problems assuring, at the same time, consistency of the corresponding solution. We give characteristic examples developed by our method, some of which can be recognized from other applications and some are introduced here for the first time. We compare various possibilities by applying them to well known datasets using neural networks of different configurations and sizes.", + "title": "Adversarial Problems for Generative Networks", + "authors": [ + "Kalliopi Basioti", + "George V. 
Moustakides" + ], + "emails": [ + "~Kalliopi_Basioti1", + "upatras.gr" + ], + "rank": 1610 + }, + { + "url": "https://openreview.net/forum?id=4xzY5yod28y", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 4, + 6 + ], + "rating": "5.21", + "confidences": [ + 4, + 4, + 3, + 4, + 4 + ], + "abstract": "Stochastic gradient descent (SGD) algorithms, with constant momentum and its variants such as Adam, are the optimization methods of choice for training deep neural networks (DNNs). There is great interest in speeding up the convergence of these methods due to their high computational expense. Nesterov accelerated gradient (NAG) with a time-varying momentum, denoted as NAG below, improves the convergence rate of gradient descent (GD) for convex optimization using a specially designed momentum; however, it accumulates error when an inexact gradient is used (such as in SGD), slowing convergence at best and diverging at worst. In this paper, we propose scheduled restart SGD (SRSGD), a new NAG-style scheme for training DNNs. SRSGD replaces the constant momentum in SGD by the increasing momentum in NAG but stabilizes the iterations by resetting the momentum to zero according to a schedule. Using a variety of models and benchmarks for image classification, we demonstrate that, in training DNNs, SRSGD significantly improves convergence and generalization; for instance, in training ResNet-200 for ImageNet classification, SRSGD achieves an error rate of 20.93% vs. the benchmark of 22.13%. These improvements become more significant as the network grows deeper. 
Furthermore, on both CIFAR and ImageNet, SRSGD reaches similar or even better error rates with significantly fewer training epochs compared to the SGD baseline.", + "title": "Scheduled Restart Momentum for Accelerated Stochastic Gradient Descent", + "authors": [ + "Bao Wang", + "Tan Minh Nguyen", + "Tao Sun", + "Andrea Bertozzi", + "Richard Baraniuk", + "Stanley Osher" + ], + "emails": [ + "~Tan_Minh_Nguyen1", + "~Tao_Sun7", + "~Stanley_Osher1", + "~Andrea_Bertozzi1", + "~Richard_Baraniuk1", + "~Bao_Wang1" + ], + "rank": 1611 + }, + { + "url": "https://openreview.net/forum?id=yfKOB5CO5dY", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 5 + ], + "rating": "5.21", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "Meta-learning methods learn the meta-knowledge among various training tasks and aim to promote the learning of new tasks under the task similarity assumption. Such meta-knowledge is often represented as a fixed distribution; this, however, may be too restrictive to capture various specific task information because the discriminative patterns in the data may change dramatically across tasks. In this work, we aim to equip the meta learner with the ability to model and produce task-specific meta-knowledge and, accordingly, present a localized meta-learning framework based on the PAC-Bayes theory. In particular, we propose a Local Coordinate Coding (LCC) based prior predictor that allows the meta learner to generate local meta-knowledge for specific tasks adaptively. We further develop a practical algorithm with deep neural network based on the bound. 
Empirical results on real-world datasets demonstrate the efficacy of the proposed method.", + "title": "Localized Meta-Learning: A PAC-Bayes Analysis for Meta-Learning Beyond Global Prior", + "authors": [ + "Chenghao Liu", + "Tao Lu", + "Doyen Sahoo", + "Yuan Fang", + "Kun Zhang", + "Steven Hoi" + ], + "emails": [ + "~Steven_Hoi2", + "~Doyen_Sahoo1", + "~Yuan_Fang1", + "~Chenghao_Liu1", + "~Tao_Lu2", + "~Kun_Zhang1" + ], + "rank": 1612 + }, + { + "url": "https://openreview.net/forum?id=IMPnRXEWpvr", + "decision": "Accept (Poster)", + "ratings": [ + 7, + 5, + 4 + ], + "rating": "5.21", + "confidences": [ + 4, + 5, + 5 + ], + "abstract": "Multi-task learning (MTL) has been widely used in representation learning. However, naively training all tasks simultaneously may lead to the partial training issue, where specific tasks are trained more adequately than others. In this paper, we propose to learn multiple tasks impartially. Specifically, for the task-shared parameters, we optimize the scaling factors via a closed-form solution, such that the aggregated gradient (sum of raw gradients weighted by the scaling factors) has equal projections onto individual tasks. For the task-specific parameters, we dynamically weigh the task losses so that all of them are kept at a comparable scale. Further, we find the above gradient balance and loss balance are complementary and thus propose a hybrid balance method to further improve the performance. Our impartial multi-task learning (IMTL) can be end-to-end trained without any heuristic hyper-parameter tuning, and is general to be applied on all kinds of losses without any distribution assumption. Moreover, our IMTL can converge to similar results even when the task losses are designed to have different scales, and thus it is scale-invariant. We extensively evaluate our IMTL on the standard MTL benchmarks including Cityscapes, NYUv2 and CelebA. 
It outperforms existing loss weighting methods under the same experimental settings.", + "title": "Towards Impartial Multi-task Learning", + "authors": [ + "Liyang Liu", + "Yi Li", + "Zhanghui Kuang", + "Jing-Hao Xue", + "Yimin Chen", + "Wenming Yang", + "Qingmin Liao", + "Wayne Zhang" + ], + "emails": [ + "~Wayne_Zhang2", + "~Yi_Li15", + "~Wenming_Yang1", + "~Qingmin_Liao1", + "~Jing-Hao_Xue1", + "~Yimin_Chen1", + "~Zhanghui_Kuang4", + "~Liyang_Liu1" + ], + "rank": 1613 + }, + { + "url": "https://openreview.net/forum?id=YPm0fzy_z6R", + "decision": "Reject", + "ratings": [ + 7, + 4, + 6, + 4 + ], + "rating": "5.21", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Given a signed social graph, how can we learn appropriate node representations to infer the signs of missing edges?\nSigned social graphs have received considerable attention to model trust relationships.\nLearning node representations is crucial to effectively analyze graph data, and various techniques such as network embedding and graph convolutional network (GCN) have been proposed for learning signed graphs.\nHowever, traditional network embedding methods are not end-to-end for a specific task such as link sign prediction, and GCN-based methods suffer from a performance degradation problem when their depth increases.\nIn this paper, we propose Signed Graph Diffusion Network (SGDNet), a novel graph neural network that achieves end-to-end node representation learning for link sign prediction in signed social graphs.\nWe propose a random walk technique specially designed for signed graphs so that SGDNet effectively diffuses hidden node features.\nThrough extensive experiments, we demonstrate that SGDNet outperforms state-of-the-art models in terms of link sign prediction accuracy. 
", + "title": "Signed Graph Diffusion Network", + "authors": [ + "Jinhong Jung", + "Jaemin Yoo", + "U Kang" + ], + "emails": [ + "~Jinhong_Jung1", + "~Jaemin_Yoo1", + "~U_Kang1" + ], + "rank": 1614 + }, + { + "url": "https://openreview.net/forum?id=Bx05YH2W8bE", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 4 + ], + "rating": "5.21", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "Hypergraph Convolutional Network (HCN) has become a default choice for capturing high-order relations among nodes, \\emph{i.e., } encoding the structure of a hypergraph. However, existing HCN models ignore the dynamic evolution of hypergraphs in the real-world scenarios, \\emph{i.e., } nodes and hyperedges in a hypergraph change dynamically over time. To capture the evolution of high-order relations and facilitate relevant analytic tasks, we formulate dynamic hypergraph and devise the Dynamic Hypergraph Convolution Networks (DyHCN). In general, DyHCN consists of a Hypergraph Convolution (HC) to encode the hypergraph structure at a time point and a Temporal Evolution module (TE) to capture the varying of the relations. The HC is delicately designed by equipping inner attention and outer attention, which adaptively aggregate nodes' features to hyperedge and estimate the importance of each hyperedge connected to the centroid node, respectively. 
Extensive experiments on the Tiigo and Stocktwits datasets show that DyHCN achieves superior performance over existing methods, which implies the effectiveness of capturing the property of dynamic hypergraphs by HC and TE modules.", + "title": "DyHCN: Dynamic Hypergraph Convolutional Networks", + "authors": [ + "Nan Yin", + "zhigang luo", + "wenjie wang", + "Fuli Feng", + "Xiang Zhang" + ], + "emails": [ + "~Nan_Yin1", + "gmail.com", + "~Xiang_Zhang7", + "nudt.edu.cn", + "~Fuli_Feng1" + ], + "rank": 1615 + }, + { + "url": "https://openreview.net/forum?id=cKnKJcTPRcV", + "decision": "Reject", + "ratings": [ + 6, + 5, + 4, + 6 + ], + "rating": "5.21", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "Graphs are the most ubiquitous form of structured data representation used in machine learning. They model, however, only pairwise relations between nodes and are not designed for encoding the higher-order relations found in many real-world datasets. To model such complex relations, hypergraphs have proven to be a natural representation. Learning the node representations in a hypergraph is more complex than in a graph as it involves information propagation at two levels: within every hyperedge and across the hyperedges. Most current approaches first transform a hypergraph structure to a graph for use in existing geometric deep learning algorithms. This transformation leads to information loss, and sub-optimal exploitation of the hypergraph's expressive power. We present HyperSAGE, a novel hypergraph learning framework that uses a two-level neural message passing strategy to accurately and efficiently propagate information through hypergraphs. The flexible design of HyperSAGE facilitates different ways of aggregating neighborhood information. Unlike the majority of related work which is transductive, our approach, inspired by the popular GraphSAGE method, is inductive. 
Thus, it can also be used on previously unseen nodes, facilitating deployment in problems such as evolving or partially observed hypergraphs. Through extensive experimentation, we show that HyperSAGE outperforms state-of-the-art hypergraph learning methods on representative benchmark datasets. We also demonstrate that the higher expressive power of HyperSAGE makes it more stable in learning node representations as compared to the alternatives. ", + "title": "HyperSAGE: Generalizing Inductive Representation Learning on Hypergraphs", + "authors": [ + "Devanshu Arya", + "Deepak Gupta", + "Stevan Rudinac", + "Marcel Worring" + ], + "emails": [ + "~Deepak_Gupta2", + "~Devanshu_Arya1", + "~Marcel_Worring1", + "uva.nl" + ], + "rank": 1616 + }, + { + "url": "https://openreview.net/forum?id=Kao09W-oe8", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6, + 4, + 6 + ], + "rating": "5.21", + "confidences": [ + 2, + 5, + 1, + 3, + 3 + ], + "abstract": "We introduce optimization methods for convolutional neural networks that can be used to improve existing gradient-based optimization in terms of generalization error. The method requires only simple processing of existing stochastic gradients, can be used in conjunction with any optimizer, and has only a linear overhead (in the number of parameters) compared to computation of the stochastic gradient. The method works by computing the gradient of the loss function with respect to output-channel directed re-weighted L2 or Sobolev metrics, which has the effect of smoothing components of the gradient across a certain direction of the parameter tensor. We show that defining the gradients along the output channel direction leads to a performance boost, while other directions can be detrimental. We present the continuum theory of such gradients, its discretization, and application to deep networks. 
Experiments on benchmark datasets, several networks, and baseline optimizers show that optimizers can be improved in generalization error by simply computing the stochastic gradient with respect to output-channel directed metrics.", + "title": "Channel-Directed Gradients for Optimization of Convolutional Neural Networks", + "authors": [ + "Dong Lao", + "Peihao Zhu", + "Peter Wonka", + "Ganesh Sundaramoorthi" + ], + "emails": [ + "~Peter_Wonka2", + "~Dong_Lao1", + "~Peihao_Zhu1", + "~Ganesh_Sundaramoorthi1" + ], + "rank": 1617 + }, + { + "url": "https://openreview.net/forum?id=y6IlNbrKcwG", + "ratings": [ + 6, + 6, + 4, + 5 + ], + "rating": "5.21", + "confidences": [ + 4, + 5, + 5, + 5 + ], + "abstract": "Differentiable Architecture Search (DARTS) has attracted extensive attention due to its efficiency in searching for cell structures. However, DARTS mainly focuses on the operation search, leaving the cell topology implicitly depending on the searched operation weights. Hence, a problem is raised: can cell topology be well represented by the operation weights? The answer is negative because we observe that the operation weights fail to indicate the performance of cell topology. In this paper, we propose to Decouple the Operation and Topology Search (DOTS), which decouples the cell topology representation from the operation weights to make an explicit topology search. DOTS is achieved by defining an additional cell topology search space besides the original operation search space. Within the DOTS framework, we propose group annealing operation search and edge annealing topology search to bridge the optimization gap between the searched over-parameterized network and the derived child network. DOTS is efficient and only costs 0.2 and 1 GPU-day to search the state-of-the-art cell architectures on CIFAR and ImageNet, respectively. By further searching for the topology of DARTS' searched cell, we can improve DARTS' performance significantly. 
The code will be publicly available.", + "title": "DOTS: Decoupling Operation and Topology in Differentiable Architecture Search", + "authors": [ + "Yuchao Gu", + "Yun Liu", + "Yi Yang", + "Yu-Huan Wu", + "Shao-Ping Lu", + "Ming-Ming Cheng" + ], + "emails": [ + "~Shao-Ping_Lu1", + "~Yi_Yang4", + "~Ming-Ming_Cheng3", + "~Yuchao_Gu1", + "~Yun_Liu1", + "~Yu-Huan_Wu1" + ], + "rank": 1618 + }, + { + "url": "https://openreview.net/forum?id=QQzomPbSV7q", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 4 + ], + "rating": "5.21", + "confidences": [ + 5, + 4, + 5, + 5 + ], + "abstract": "Metric learning seeks perceptual embeddings where visually similar instances are close and dissimilar instances are apart, but learned representation can be sub-optimal when the distribution of intra-class samples is diverse and distinct sub-clusters are present. We theoretically prove and empirically show that under reasonable noise assumptions, prevalent embedding losses in metric learning, e.g., triplet loss, tend to project all samples of a class with various modes onto a single point in the embedding space, resulting in a class collapse that usually renders the space ill-sorted for classification or retrieval. To address this problem, we propose a simple modification to the embedding losses such that each sample selects its nearest same-class counterpart in a batch as the positive element in the tuple/triplet. This allows for the presence of multiple sub-clusters within each class. The adaptation can be integrated into a wide range of metric learning losses. 
Our method demonstrates clear benefits on various fine-grained image retrieval datasets over a variety of existing losses; qualitative retrieval results show that samples with similar visual patterns are indeed closer in the embedding space.\n", + "title": "Reducing Class Collapse in Metric Learning with Easy Positive Sampling", + "authors": [ + "Elad Levi", + "Tete Xiao", + "Xiaolong Wang", + "trevor darrell" + ], + "emails": [ + "~Xiaolong_Wang3", + "~trevor_darrell1", + "~Elad_Levi1", + "~Tete_Xiao1" + ], + "rank": 1619 + }, + { + "url": "https://openreview.net/forum?id=JzG0n48hRf", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4, + 6 + ], + "rating": "5.20", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "In addition to achieving high accuracy, in many applications, it is important to estimate the probability that a model prediction is correct. Predictive uncertainty is particularly important on out of distribution (OOD) data where accuracy degrades. However, models are typically overconfident, and model calibration on OOD data remains a challenge. In this paper we propose a simple post hoc calibration method that significantly improves on benchmark results [Ovadia et al 2019] on a wide range of corrupted data. Our method uses outlier exposure to properly calibrate the model probabilities.", + "title": "Uncertainty for deep image classifiers on out of distribution data. ", + "authors": [ + "Tiago Salvador", + "Alexander Iannantuono", + "Adam M Oberman" + ], + "emails": [ + "~Adam_M_Oberman1", + "mail.mcgill.ca" + ], + "rank": 1620 + }, + { + "url": "https://openreview.net/forum?id=7qmQNB6Wn_B", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 5 + ], + "rating": "5.20", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Policy entropy regularization is commonly used for better exploration in deep reinforcement learning (RL). 
However, policy entropy regularization is sample-inefficient in off-policy learning since it does not take the distribution of previous samples stored in the replay buffer into account. In order to take advantage of the previous sample distribution from the replay buffer for sample-efficient exploration, we propose sample-aware entropy regularization which maximizes the entropy of weighted sum of the policy action distribution and the sample action distribution from the replay buffer. We formulate the problem of sample-aware entropy regularized policy iteration, prove its convergence, and provide a practical algorithm named diversity actor-critic (DAC) which is a generalization of soft actor-critic (SAC). Numerical results show that DAC significantly outperforms SAC baselines and other state-of-the-art RL algorithms.", + "title": "Diversity Actor-Critic: Sample-Aware Entropy Regularization for Sample-Efficient Exploration", + "authors": [ + "Seungyul Han", + "Youngchul Sung" + ], + "emails": [ + "~Seungyul_Han1", + "~Youngchul_Sung1" + ], + "rank": 1621 + }, + { + "url": "https://openreview.net/forum?id=JiNvAGORcMW", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 5 + ], + "rating": "5.20", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Representation learning on visualized input is an important yet challenging task for deep reinforcement learning (RL). The feature space learned from visualized input not only dominates the agent's generalization ability in new environments but also affect the data efficiency during training. To help the RL agent learn general and discriminative representation among various states, we present cross-state self-constraint(CSSC), a novel constraint that regularizes the representation feature space by comparing similarity of different pairs of representations. 
Based on the representation-behavior connection derived from the agent's experience, this constraint helps reinforce the general feature recognition during the learning process and thus enhance the generalization to unseen environment. We test our proposed method on the OpenAI ProcGen benchmark and see significant improvement on generalization performance across most of ProcGen games.", + "title": "Cross-State Self-Constraint for Feature Generalization in Deep Reinforcement Learning", + "authors": [ + "Guan Ting Liu", + "Pu-Jen Cheng", + "GuanYu Lin" + ], + "emails": [ + "~Guan_Ting_Liu1", + "~Pu-Jen_Cheng1", + "csie.ntu.edu.tw" + ], + "rank": 1622 + }, + { + "url": "https://openreview.net/forum?id=HWqv5Pm3E3", + "decision": "Reject", + "ratings": [ + 6, + 4, + 6 + ], + "rating": "5.20", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "In this paper, we propose a novel domain adaptation method for the source-free setting. In this setting, we cannot access source data during adaptation, while unlabeled target data and a model pretrained with source data are given. Due to lack of source data, we cannot directly match the data distributions between domains unlike typical domain adaptation algorithms. To cope with this problem, we propose utilizing batch normalization statistics stored in the pretrained model to approximate the distribution of unobserved source data. Specifically, we fix the classifier part of the model during adaptation and only fine-tune the remaining feature encoder part so that batch normalization statistics of the features extracted by the encoder match those stored in the fixed classifier. Additionally, we also maximize the mutual information between the features and the classifier's outputs to further boost the classification performance. 
Experimental results with several benchmark datasets show that our method achieves competitive performance with state-of-the-art domain adaptation methods even though it does not require access to source data. ", + "title": "Source-free Domain Adaptation via Distributional Alignment by Matching Batch Normalization Statistics", + "authors": [ + "Masato Ishii", + "Masashi Sugiyama" + ], + "emails": [ + "~Masato_Ishii1", + "~Masashi_Sugiyama1" + ], + "rank": 1623 + }, + { + "url": "https://openreview.net/forum?id=SPyxaz_h9Nd", + "decision": "Reject", + "ratings": [ + 4, + 5, + 6, + 6 + ], + "rating": "5.20", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Slimmable neural networks provide a flexible trade-off front between prediction error and computational cost (such as the number of floating-point operations or FLOPs) with the same storage cost as a single model. They have been proposed recently for resource-constrained settings such as mobile devices. However, current slimmable neural networks use a single width-multiplier for all the layers to arrive at sub-networks with different performance profiles, which neglects that different layers affect the network's prediction accuracy differently and have different FLOP requirements. Hence, developing a principled approach for deciding width-multipliers across different layers could potentially improve the performance of slimmable networks. To allow for heterogeneous width-multipliers across different layers, we formulate the problem of optimizing slimmable networks from a multi-objective optimization lens, which leads to a novel algorithm for optimizing both the shared weights and the width-multipliers for the sub-networks. We perform extensive empirical analysis with 15 network and dataset combinations and two types of cost objectives, i.e., FLOPs and memory footprint, to demonstrate the effectiveness of the proposed method compared to existing alternatives. 
Quantitatively, improvements up to 1.7% and 8% in top-1 accuracy on the ImageNet dataset can be attained for MobileNetV2 considering FLOPs and memory footprint, respectively. Our results highlight the potential of optimizing the channel counts for different layers jointly with the weights for slimmable networks.", + "title": "PareCO: Pareto-aware Channel Optimization for Slimmable Neural Networks", + "authors": [ + "Rudy Chin", + "Ari S. Morcos", + "Diana Marculescu" + ], + "emails": [ + "~Rudy_Chin2", + "~Diana_Marculescu4", + "~Ari_S._Morcos1" + ], + "rank": 1624 + }, + { + "url": "https://openreview.net/forum?id=ufS1zWbRCEa", + "decision": "Reject", + "ratings": [ + 4, + 9, + 6, + 3 + ], + "rating": "5.20", + "confidences": [ + 3, + 3, + 4, + 5 + ], + "abstract": "Deep learning models trained on large data sets have been widely successful in both vision and language domains. As state-of-the-art deep learning architectures have continued to grow in parameter count so have the compute budgets and times required to train them, increasing the need for compute-efficient methods that parallelize training. Two common approaches to parallelize the training of deep networks have been data and model parallelism. While useful, data and model parallelism suffer from diminishing returns in terms of compute efficiency for large batch sizes. In this paper, we investigate how to continue scaling compute efficiently beyond the point of diminishing returns for large batches through local parallelism, a framework which parallelizes training of individual layers in deep networks by replacing global backpropagation with truncated layer-wise backpropagation. Local parallelism enables fully asynchronous layer-wise parallelism with a low memory footprint, and requires little communication overhead compared with model parallelism. 
We show results in both vision and language domains across a diverse set of architectures, and find that local parallelism is particularly effective in the high-compute regime. ", + "title": "Parallel Training of Deep Networks with Local Updates", + "authors": [ + "Michael Laskin", + "Luke Metz", + "Seth Nabarro", + "Mark Saroufim", + "Badreddine Noune", + "Carlo Luschi", + "Jascha Sohl-Dickstein", + "Pieter Abbeel" + ], + "emails": [ + "~Jascha_Sohl-Dickstein2", + "~Luke_Metz1", + "~Michael_Laskin1", + "graphcore.ai", + "~Pieter_Abbeel2" + ], + "rank": 1625 + }, + { + "url": "https://openreview.net/forum?id=dnKsslWzLNY", + "decision": "Reject", + "ratings": [ + 6, + 6, + 4 + ], + "rating": "5.20", + "confidences": [ + 2, + 4, + 4 + ], + "abstract": "With the continuously increasing number of quantum bits in quantum computers, there are growing interests in exploring applications that can harvest the power of them. Recently, several attempts were made to implement neural networks, known to be computationally intensive, in hybrid quantum-classical scheme computing. While encouraging results are shown, two fundamental questions need to be answered: (1) whether neural networks in hybrid quantum-classical computing can leverage quantum power and meanwhile approximate any function within a given error bound, i.e., universal approximability; (2) how do these neural networks compare with ones on a classical computer in terms of representation power? 
This work sheds light on these two questions from a theoretical perspective.", + "title": "On the Universal Approximability and Complexity Bounds of Deep Learning in Hybrid Quantum-Classical Computing", + "authors": [ + "Weiwen Jiang", + "Yukun Ding", + "Yiyu Shi" + ], + "emails": [ + "~Yiyu_Shi1", + "~Yukun_Ding1", + "~Weiwen_Jiang1" + ], + "rank": 1626 + }, + { + "url": "https://openreview.net/forum?id=j5d9qacxdZa", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6, + 4 + ], + "rating": "5.20", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "We motivate Energy-Based Models (EBMs) as a promising model class for continual learning problems. Instead of tackling continual learning via the use of external memory, growing models, or regularization, EBMs have a natural way to support a dynamically-growing number of tasks and classes and less interference with old tasks. We show that EBMs are adaptable to a more general continual learning setting where the data distribution changes without the notion of explicitly delineated tasks. We also find that EBMs outperform the baseline methods by a large margin on several continual learning benchmarks. These observations point towards EBMs as a class of models naturally inclined towards the continual learning regime.", + "title": "Energy-Based Models for Continual Learning", + "authors": [ + "Shuang Li", + "Yilun Du", + "Gido Martijn van de Ven", + "Antonio Torralba", + "Igor Mordatch" + ], + "emails": [ + "~Gido_Martijn_van_de_Ven1", + "~Antonio_Torralba1", + "~Shuang_Li5", + "~Igor_Mordatch4", + "~Yilun_Du1" + ], + "rank": 1627 + }, + { + "url": "https://openreview.net/forum?id=-kigPjfTIGd", + "decision": "Reject", + "ratings": [ + 7, + 3, + 6, + 3, + 6 + ], + "rating": "5.20", + "confidences": [ + 5, + 3, + 3, + 4, + 5 + ], + "abstract": "Current state-of-the-art generative models for videos have high computational requirements that impede high resolution generations beyond a few frames. 
In this work we propose a stage-wise strategy to train Generative Adversarial Networks (GANs) for videos. We decompose the generative process to first produce a downsampled video that is then spatially upscaled and temporally interpolated by subsequent stages. Upsampling stages are applied locally on temporal chunks of previous outputs to manage the computational complexity. Stages are defined as Generative Adversarial Networks, which are trained sequentially and independently. We validate our approach on Kinetics-600 and BDD100K, for which we train a three stage model capable of generating 128x128 videos with 100 frames.", + "title": "SSW-GAN: Scalable Stage-wise Training of Video GANs", + "authors": [ + "Lluis Castrejon", + "Nicolas Ballas", + "Aaron Courville" + ], + "emails": [ + "~Lluis_Castrejon1", + "~Aaron_Courville3", + "~Nicolas_Ballas1" + ], + "rank": 1628 + }, + { + "url": "https://openreview.net/forum?id=Mub9VkGZoZe", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5, + 6, + 5 + ], + "rating": "5.20", + "confidences": [ + 2, + 3, + 3, + 4, + 3 + ], + "abstract": "How to learn a good representation of data is one of the most important topics of machine learning. Disentanglement of representations, though believed to be the core feature of good representations, has caused a lot of debates and discussions in recent. Sorrenson et al. (2020), using the techniques developed in nonlinear independent analysis theory, show that general incompressible-flow networks (GIN) can recover the underlying latent variables that generate the data, and thus can provide a compact and disentangled representation. However, in this paper, we point out that the method taken by GIN for informative latent variables identification is not theoretically supported and can be disproved by experiments. We propose to use the mutual information between latent variables and the auxiliary variable to correctly identify informative latent variables. 
We directly verify the improvement brought by our method in experiments on synthetic data. We further show the advantage of our method on various downstream tasks including classification, outlier detection and adversarial attack defence.", + "title": "Identifying Informative Latent Variables Learned by GIN via Mutual Information", + "authors": [ + "Chen Zhang", + "Yitong Sun", + "Mingtian Zhang" + ], + "emails": [ + "~Chen_Zhang6", + "~Mingtian_Zhang1", + "~Yitong_Sun1" + ], + "rank": 1629 + }, + { + "url": "https://openreview.net/forum?id=cbtV7xGO9pS", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 7 + ], + "rating": "5.20", + "confidences": [ + 4, + 3, + 2, + 1 + ], + "abstract": "Trust region methods and maximum entropy methods are two state-of-the-art branches used in reinforcement learning (RL) for the benefits of stability and exploration in continuous environments, respectively. This paper proposes to integrate both branches in a unified framework, thus benefiting from both sides. We first transform the original RL objective to a constraint optimization problem and then proposes trust entropy actor-critic (TEAC), an off-policy algorithm to learn stable and sufficiently explored policies for continuous states and actions. TEAC trains the critic by minimizing the refined Bellman error and updates the actor by minimizing KL-divergence loss derived from the closed-form solution to the Lagrangian. \nWe prove that the policy evaluation and policy improvement in TEAC is guaranteed to converge.\nWe compare TEAC with 4 state-of-the-art solutions on 6 tasks in the MuJoCo environment. 
The results show that TEAC outperforms state-of-the-art solutions in terms of efficiency and effectiveness.\n", + "title": "TEAC: Intergrating Trust Region and Max Entropy Actor Critic for Continuous Control", + "authors": [ + "Hongyu Zang", + "Xin Li", + "Li Zhang", + "Peiyao Zhao", + "Mingzhong Wang" + ], + "emails": [ + "~Li_Zhang18", + "~Peiyao_Zhao1", + "~Mingzhong_Wang1", + "~Hongyu_Zang1", + "~Xin_Li31" + ], + "rank": 1630 + }, + { + "url": "https://openreview.net/forum?id=FOR2VqgJXb", + "decision": "Reject", + "ratings": [ + 4, + 4, + 7 + ], + "rating": "5.20", + "confidences": [ + 3, + 3, + 4 + ], + "abstract": "We consider the problem of evaluating representations of data for use in solving a downstream task. We propose to measure the quality of a representation by the complexity of learning a predictor on top of the representation that achieves low loss on a task of interest. To this end, we introduce two measures: surplus description length (SDL) and \u03b5 sample complexity (\u03b5SC). To compare our methods to prior work, we also present a framework based on plotting the validation loss versus dataset size (the \"loss-data\" curve). Existing measures, such as mutual information and minimum description length, correspond to slices and integrals along the data-axis of the loss-data curve, while ours correspond to slices and integrals along the loss-axis. This analysis shows that prior methods measure properties of an evaluation dataset of a specified size, whereas our methods measure properties of a predictor with a specified loss. 
We conclude with experiments on real data to compare the behavior of these methods over datasets of varying size.", + "title": "Evaluating representations by the complexity of learning low-loss predictors", + "authors": [ + "William F Whitney", + "Min Jae Song", + "David Brandfonbrener", + "Jaan Altosaar", + "Kyunghyun Cho" + ], + "emails": [ + "~Kyunghyun_Cho1", + "~Jaan_Altosaar1", + "~David_Brandfonbrener1", + "~Min_Jae_Song1", + "~William_F_Whitney1" + ], + "rank": 1631 + }, + { + "url": "https://openreview.net/forum?id=jxdXSW9Doc", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 6, + 6 + ], + "rating": "5.20", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "In this paper, we study the statistical properties of distributed kernel ridge regression together with random features (DKRR-RF), and obtain optimal generalization bounds under the basic setting, which can substantially relax the restriction on the number of local machines in the existing state-of-art bounds. Specifically, we first show that the simple combination of divide-and-conquer technique and random features can achieve the same statistical accuracy as the exact KRR in expectation requiring only O(|D|) memory and O(|D|1.5) time. Then, beyond the generalization bounds in expectation that demonstrate the average information for multiple trails, we derive generalization bounds in probability to capture the learning performance for a single trail. 
Finally, we propose an effective communication strategy to further improve the performance of DKRR-RF, and validate the theoretical bounds via numerical experiments.", + "title": "Effective Distributed Learning with Random Features: Improved Bounds and Algorithms", + "authors": [ + "Yong Liu", + "Jiankun Liu", + "Shuqiang Wang" + ], + "emails": [ + "iie.ac.cn", + "ruc.edu.cn", + "siat.ac.cn" + ], + "rank": 1632 + }, + { + "url": "https://openreview.net/forum?id=RuUdMAU-XbI", + "decision": "Reject", + "ratings": [ + 3, + 6, + 6, + 6 + ], + "rating": "5.20", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "One practice of employing deep neural networks is to apply the same architecture to all the input instances. However, a fixed architecture may not be representative enough for data with high diversity. To promote the model capacity, existing approaches usually employ larger convolutional kernels or deeper network structure, which may increase the computational cost. In this paper, we address this issue by raising the Dynamic Graph Network (DG-Net). The network learns the instance-aware connectivity, which creates different forward paths for different instances. Specifically, the network is initialized as a complete directed acyclic graph, where the nodes represent convolutional blocks and the edges represent the connection paths. We generate edge weights by a learnable module \\textit{router} and select the edges whose weights are larger than a threshold, to adjust the connectivity of the neural network structure. Instead of using the same path of the network, DG-Net aggregates features dynamically in each node, which allows the network to have more representation ability. To facilitate the training, we represent the network connectivity of each sample in an adjacency matrix. The matrix is updated to aggregate features in the forward pass, cached in the memory, and used for gradient computing in the backward pass. 
We verify the effectiveness of our method with several static architectures, including MobileNetV2, ResNet, ResNeXt, and RegNet. Extensive experiments are performed on ImageNet classification and COCO object detection, which shows the effectiveness and generalization ability of our approach.", + "title": "Dynamic Graph: Learning Instance-aware Connectivity for Neural Networks", + "authors": [ + "Kun Yuan", + "Quanquan Li", + "Dapeng Chen", + "Aojun Zhou", + "Junjie Yan" + ], + "emails": [ + "~Kun_Yuan1", + "~Quanquan_Li1", + "~Junjie_Yan1", + "~Aojun_Zhou2", + "~Dapeng_Chen4" + ], + "rank": 1633 + }, + { + "url": "https://openreview.net/forum?id=RSn0s-T-qoy", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 6 + ], + "rating": "5.20", + "confidences": [ + 3, + 4, + 5, + 3 + ], + "abstract": "Learning effective representations for data with multiple views is crucial in machine learning and pattern recognition. Recently great efforts have focused on learning unified or latent representations to integrate information from different views for specific tasks. These approaches generally assume simple or implicit relationships between different views and as a result are not able to flexibly and explicitly depict the correlations among these views. To address this, we firstly propose the definition and conditions for multi-view disentanglement providing general instructions for disentangling representations between different views. Furthermore, a novel objective function is derived to explicitly disentangle the multi-view data into a shared part across different views and a (private) exclusive part within each view. 
Experiments on a variety of multi-modal datasets demonstrate that our objective can effectively disentangle information from different views while satisfying the disentangling conditions.", + "title": "Multi-View Disentangled Representation", + "authors": [ + "Zongbo Han", + "Changqing Zhang", + "Huazhu Fu", + "Qinghua Hu", + "Joey Tianyi Zhou" + ], + "emails": [ + "~Changqing_Zhang1", + "~Zongbo_Han1", + "~Joey_Tianyi_Zhou1", + "~Huazhu_Fu4", + "~Qinghua_Hu1" + ], + "rank": 1634 + }, + { + "url": "https://openreview.net/forum?id=JAlqRs9duhz", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5, + 6 + ], + "rating": "5.19", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "Advanced large-scale neural language models have led to significant success in many natural language generation tasks. However, the most commonly used training objective, Maximum Likelihood Estimation (MLE), has been shown to be problematic, where the trained model prefers using dull and repetitive phrases. In this work, we introduce ScaleGrad, a modification straight to the gradient of the loss function, to remedy the degeneration issues of the standard MLE objective. By directly maneuvering the gradient information, ScaleGrad makes the model learn to use novel tokens during training. Empirical results show the effectiveness of our method not only in open-ended generation, but also in directed generation. 
With the simplicity in architecture, our method can serve as a general training objective that is applicable to most of the neural text generation tasks.", + "title": "Straight to the Gradient: Learning to Use Novel Tokens for Neural Text Generation", + "authors": [ + "Xiang Lin", + "SIMENG HAN", + "Shafiq Joty" + ], + "emails": [ + "~SIMENG_HAN1", + "~Shafiq_Joty1", + "~Xiang_Lin2" + ], + "rank": 1635 + }, + { + "url": "https://openreview.net/forum?id=in2qzBZ-Vwr", + "decision": "Reject", + "ratings": [ + 3, + 6, + 7, + 5 + ], + "rating": "5.19", + "confidences": [ + 4, + 5, + 3, + 4 + ], + "abstract": "Learning to detect an object in an image from very few training examples - few-shot object detection - is challenging, because the classifier that sees proposal boxes has very little training data. A particularly challenging training regime occurs when there are one or two training examples. In this case, if the region proposal network (RPN) misses even one high intersection-over-union (IOU) training box, the classifier's model of how object appearance varies can be severely impacted. We use multiple distinct yet cooperating RPN's. Our RPN's are trained to be different, but not too different; doing so yields significant performance improvements over state of the art for COCO and PASCAL VOC in the very few-shot setting. This effect appears to be independent of the choice of classifier or dataset. \n", + "title": "Cooperating RPN's Improve Few-Shot Object Detection", + "authors": [ + "Weilin Zhang", + "Yu-Xiong Wang", + "David Forsyth" + ], + "emails": [ + "~David_Forsyth1", + "~Weilin_Zhang1", + "~Yu-Xiong_Wang1" + ], + "rank": 1636 + }, + { + "url": "https://openreview.net/forum?id=lSijhyKKsct", + "decision": "Reject", + "ratings": [ + 4, + 7, + 3, + 7 + ], + "rating": "5.19", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "Temporal information is essential to learning effective policies with Reinforcement Learning (RL). 
However, current state-of-the-art RL algorithms either assume that such information is given as part of the state space or, when learning from pixels, use the simple heuristic of frame-stacking to implicitly capture temporal information present in the image observations. This heuristic is in contrast to the current paradigm in video classification architectures, which utilize explicit encodings of temporal information through methods such as optical flow and two-stream architectures to achieve state-of-the-art performance. Inspired by leading video classification architectures, we introduce the Flow of Latents for Reinforcement Learning Flare, a network architecture for RL that explicitly encodes temporal information through latent vector differences. We show that Flare (i) recovers optimal performance in state-based RL without explicit access to the state velocity, solely with positional state information, (ii) achieves state-of-the-art performance on pixel-based continuous control tasks within the DeepMind control benchmark suite, (iii) is the most sample efficient model-free pixel-based RL algorithm on challenging environments in the DeepMind control suite such as quadruped walk, hopper hop, finger turn hard, pendulum swing, and walker run, outperforming the prior model-free state-of-the-art by 1.9 and 1.5 on the 500k and 1M step benchmarks, respectively, and (iv), when augmented over rainbow DQN, outperforms or matches the baseline on a diversity of challenging Atari games at 50M time step benchmark.", + "title": "Reinforcement Learning with Latent Flow", + "authors": [ + "Wenling Shang", + "Xiaofei Wang", + "Aravind Rajeswaran", + "Aravind Srinivas", + "Yang Gao", + "Pieter Abbeel", + "Michael Laskin" + ], + "emails": [ + "~Aravind_Srinivas1", + "~Michael_Laskin1", + "~Aravind_Rajeswaran1", + "~Wenling_Shang1", + "berkeley.edu", + "~Pieter_Abbeel2", + "~Yang_Gao1" + ], + "rank": 1637 + }, + { + "url": "https://openreview.net/forum?id=yvzMA5im3h", + "decision": 
"Reject", + "ratings": [ + 4, + 5, + 7, + 5 + ], + "rating": "5.19", + "confidences": [ + 5, + 4, + 4, + 3 + ], + "abstract": "Graph attention networks (GATs) have been recognized as powerful tools for learning in graph structured data. However, how to enable the attention mechanisms in GATs to smoothly consider both structural and feature information is still very challenging. In this paper, we propose Graph Joint Attention Networks (JATs) to address the aforementioned challenge. Different from previous attention-based graph neural networks (GNNs), JATs adopt novel joint attention mechanisms which can automatically determine the relative significance between node features and structural coefficients learned from graph subspace, when computing the attention scores. Therefore, representations concerning more structural properties can be inferred by JATs.\nBesides, we theoretically analyze the expressive power of JATs and further propose an improved strategy for the joint attention mechanisms that enables JATs to reach the upper bound of expressive power which every message-passing GNN can ultimately achieve, i.e., 1-WL test. JATs can thereby be seen as most powerful message-passing GNNs. The proposed neural architecture has been extensively tested on widely used benchmarking datasets, including Cora, Cite, and Pubmed and has been compared with state-of-the-art GNNs for node classification tasks. 
Experimental results show that JATs achieve state-of-the-art performance on all the testing datasets.", + "title": "Graph Joint Attention Networks", + "authors": [ + "Tiantian He", + "Lu Bai", + "Yew-Soon Ong" + ], + "emails": [ + "ntu.edu.sg", + "~Tiantian_He1", + "~Yew-Soon_Ong1" + ], + "rank": 1638 + }, + { + "url": "https://openreview.net/forum?id=KWToR-Phbrz", + "decision": "Reject", + "ratings": [ + 6, + 7, + 4, + 4 + ], + "rating": "5.19", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "Explainability of machine learning models has gained considerable attention within our research community given the importance of deploying more reliable machine-learning systems. Explanability can also be helpful for model debugging. In computer vision applications, most methods explain models by displaying the regions in the input image that they focus on for their prediction, but it is difficult to improve models based on these explanations since they do not indicate why the model fail. Counterfactual methods, on the other hand, indicate how to perturb the input to change the model prediction, providing details about the model's decision-making. Unfortunately, current counterfactual methods make ambiguous interpretations as they combine multiple biases of the model and the data in a single counterfactual interpretation of the model's decision. Moreover, these methods tend to generate trivial counterfactuals about the model's decision, as they often suggest to exaggerate or remove the presence of the attribute being classified. Trivial counterfactuals are usually not valuable, since the information they provide is often already known to the system's designer. In this work, we propose a counterfactual method that learns a perturbation in a disentangled latent space that is constrained using a diversity-enforcing loss to uncover multiple valuable explanations about the model's prediction. 
Further, we introduce a mechanism to prevent the model from producing trivial explanations. Experiments on CelebA and Synbols demonstrate that our model improves the success rate of producing high-quality valuable explanations when compared to previous state-of-the-art methods. We will make the code public.", + "title": "Beyond Trivial Counterfactual Generations with Diverse Valuable Explanations", + "authors": [ + "Pau Rodriguez", + "Massimo Caccia", + "Alexandre Lacoste", + "Lee Zamparo", + "Issam H. Laradji", + "Laurent Charlin", + "David Vazquez" + ], + "emails": [ + "~Alexandre_Lacoste1", + "~Issam_H._Laradji1", + "~Massimo_Caccia1", + "~Pau_Rodriguez2", + "~David_Vazquez1", + "~Lee_Zamparo1", + "~Laurent_Charlin1" + ], + "rank": 1639 + }, + { + "url": "https://openreview.net/forum?id=VYfotZsQV5S", + "decision": "Reject", + "ratings": [ + 3, + 6, + 7, + 5, + 6 + ], + "rating": "5.19", + "confidences": [ + 5, + 4, + 3, + 1, + 3 + ], + "abstract": "Many constrained, nonconvex and nonsmooth optimization problems can be tackled using the majorization-minimization (MM) method which alternates between constructing a surrogate function which upper bounds the objective function, and then minimizing this surrogate. For problems which minimize a finite sum of functions, a stochastic version of the MM method selects a batch of functions at random at each iteration and optimizes the accumulated surrogate.\nHowever, in many cases of interest such as variational inference for latent variable models, the surrogate functions are expressed as an expectation. In this contribution, we propose a doubly stochastic MM method based on Monte Carlo approximation of these stochastic surrogates.\nWe establish asymptotic and non-asymptotic convergence of our scheme in a constrained, nonconvex, nonsmooth optimization setting. 
We apply our new framework for inference of logistic regression model with missing data and for variational inference of Bayesian variants of LeNet-5 and Resnet-18 on respectively the MNIST and CIFAR-10 datasets.", + "title": "MISSO: Minimization by Incremental Stochastic Surrogate Optimization for Large Scale Nonconvex and Nonsmooth Problems", + "authors": [ + "Belhal Karimi", + "Hoi To Wai", + "Eric Moulines", + "Ping Li" + ], + "emails": [ + "~Ping_Li3", + "~Eric_Moulines1", + "~Hoi_To_Wai1", + "~Belhal_Karimi1" + ], + "rank": 1640 + }, + { + "url": "https://openreview.net/forum?id=KBWK5Y92BRh", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6, + 4 + ], + "rating": "5.19", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "Existing neural architecture search (NAS) methods often return an architecture with good search performance but generalizes poorly to the test setting. To achieve better generalization, we propose a novel neighborhood-aware NAS formulation to identify flat-minima architectures in the search space, with the assumption that flat minima generalize better than sharp minima. The phrase ``flat-minima architecture'' refers to architectures whose performance is stable under small perturbations in the architecture (\\emph{e.g.}, replacing a convolution with a skip connection). Our formulation takes the ``flatness'' of an architecture into account by aggregating the performance over the neighborhood of this architecture. We demonstrate a principled way to apply our formulation to existing search algorithms, including sampling-based algorithms and gradient-based algorithms. To facilitate the application to gradient-based algorithms, we also propose a differentiable representation for the neighborhood of architectures. Based on our formulation, we propose neighborhood-aware random search (NA-RS) and neighborhood-aware differentiable architecture search (NA-DARTS). 
Notably, by simply augmenting DARTS~\\cite{liu2018darts} with our formulation, NA-DARTS finds architectures that perform better or on par with those found by state-of-the-art NAS methods on established benchmarks, including CIFAR-10, CIAFR-100 and ImageNet.", + "title": "Neighborhood-Aware Neural Architecture Search", + "authors": [ + "Xiaofang Wang", + "Shengcao Cao", + "Mengtian Li", + "Kris M. Kitani" + ], + "emails": [ + "~Kris_M._Kitani1", + "~Mengtian_Li1", + "~Xiaofang_Wang1", + "~Shengcao_Cao1" + ], + "rank": 1641 + }, + { + "url": "https://openreview.net/forum?id=3rRgu7OGgBI", + "decision": "Reject", + "ratings": [ + 8, + 5, + 4, + 4 + ], + "rating": "5.19", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "It is common within the deep learning community to first pre-train a deep neural network from a large-scale dataset and then fine-tune the pre-trained model to a specific downstream task. Recently, both supervised and unsupervised pre-training approaches to learning representations have achieved remarkable advances, which exploit the discriminative knowledge of labels and the intrinsic structure of data, respectively. It follows natural intuition that both discriminative knowledge and intrinsic structure of the downstream task can be useful for fine-tuning, however, existing fine-tuning methods mainly leverage the former and discard the latter. A question arises: How to fully explore the intrinsic structure of data for boosting fine-tuning? In this paper, we propose Bi-tuning, a general learning framework to fine-tuning both supervised and unsupervised pre-trained representations to downstream tasks. 
Bi-tuning generalizes the vanilla fine-tuning by integrating two heads upon the backbone of pre-trained representations: a classifier head with an improved contrastive cross-entropy loss to better leverage the label information in an instance-contrast way, and a projector head with a newly-designed categorical contrastive learning loss to fully exploit the intrinsic structure of data in a category-consistent way. Comprehensive experiments confirm that Bi-tuning achieves state-of-the-art results for fine-tuning tasks of both supervised and unsupervised pre-trained models by large margins (e.g.~10.7\\% absolute rise in accuracy on CUB in low-data regime).", + "title": "Bi-tuning of Pre-trained Representations", + "authors": [ + "Jincheng Zhong", + "Ximei Wang", + "Zhi Kou", + "Jianmin Wang", + "Mingsheng Long" + ], + "emails": [ + "~Ximei_Wang1", + "mails.tsinghua.edu.cn", + "~Jianmin_Wang1", + "gmail.com", + "~Mingsheng_Long5" + ], + "rank": 1642 + }, + { + "url": "https://openreview.net/forum?id=X5ivSy4AHx", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6, + 4 + ], + "rating": "5.19", + "confidences": [ + 3, + 3, + 5, + 5 + ], + "abstract": "Min-max optimization captures many important machine learning problems such as robust adversarial learning and inverse reinforcement learning, and nonconvex-strongly-concave min-max optimization has been an active line of research. Specifically, a novel variance reduction algorithm SREDA was proposed recently by (Luo et al. 2020) to solve such a problem, and was shown to achieve the optimal complexity dependence on the required accuracy level \u03f5. Despite the superior theoretical performance, the convergence guarantee of SREDA requires stringent initialization accuracy and an \u03f5-dependent stepsize for controlling the per-iteration progress, so that SREDA can run very slowly in practice. 
This paper develops a novel analytical framework that guarantees the SREDA's optimal complexity performance for a much enhanced algorithm SREDA-Boost, which has less restrictive initialization requirement and an accuracy-independent (and much bigger) stepsize. Hence, SREDA-Boost runs substantially faster in experiments than SREDA. We further apply SREDA-Boost to propose a zeroth-order variance reduction algorithm named ZO-SREDA-Boost for the scenario that has access only to the information about function values not gradients, and show that ZO-SREDA-Boost outperforms the best known complexity dependence on \u03f5. This is the first study that applies the variance reduction technique to zeroth-order algorithm for min-max optimization problems.", + "title": "Enhanced First and Zeroth Order Variance Reduced Algorithms for Min-Max Optimization", + "authors": [ + "Tengyu Xu", + "Zhe Wang", + "Yingbin Liang", + "H. Vincent Poor" + ], + "emails": [ + "~Zhe_Wang4", + "~Yingbin_Liang1", + "~Tengyu_Xu1", + "~H._Vincent_Poor1" + ], + "rank": 1643 + }, + { + "url": "https://openreview.net/forum?id=pRGF3Jtaie", + "decision": "Reject", + "ratings": [ + 8, + 4, + 6, + 3 + ], + "rating": "5.18", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "Machine learning models are being used extensively in many important areas, but there is no guarantee that a model will always perform well or as its developers intended. Understanding the correctness of a model is crucial to prevent potential failures that may have significant detrimental impact in critical application areas. In this paper, we propose a novel framework to efficiently test a machine learning model using only a small amount of labelled test data. The core idea is to efficiently estimate the metrics of interest for a model-under-test using Bayesian neural network. We develop a methodology to efficiently train the Bayesian neural network from the limited number of labelled data. 
We also devise an entropy-based sampling strategy to sample the data point such that the proposed framework can give accurate estimations for the metrics of interest. Finally, we conduct an extensive set of experiments to test various machine learning models for different types of metrics. Our experiments with multiple datasets show that given a testing budget, the estimation of the metrics by our method is significantly better compared to existing state-of-the-art approaches.", + "title": "ALT-MAS: A Data-Efficient Framework for Active Testing of Machine Learning Algorithms", + "authors": [ + "Huong Ha", + "Sunil Gupta", + "Santu Rana", + "Svetha Venkatesh" + ], + "emails": [ + "~Santu_Rana1", + "~Huong_Ha3", + "~Sunil_Gupta2", + "~Svetha_Venkatesh1" + ], + "rank": 1644 + }, + { + "url": "https://openreview.net/forum?id=ESVGfJM9a7", + "decision": "Reject", + "ratings": [ + 8, + 5, + 4, + 4 + ], + "rating": "5.18", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Forecasting events occurring in space and time is a fundamental problem. Existing neural point process models are only temporal and are limited in spatial inference. We propose a family of deep sequence models that integrate spatiotemporal point processes with deep neural networks. Our novel Neural Spatiotemporal Point Process model is flexible, efficient, and can accurately predict irregularly sampled events. The key construction of our approach is based on space-time separation of temporal intensity function and time-conditioned spatial density function, which is approximated by kernel density estimation. We validate our model on the synthetic spatiotemporal Hawkes process and self-correcting process. On many benchmark spatiotemporal event forecasting datasets, our model demonstrates superior performances. To the best of our knowledge, this is the first neural point process model that can jointly predict the continuous space and time of events. 
", + "title": "Neural Point Process for Forecasting Spatiotemporal Events", + "authors": [ + "Zihao Zhou", + "Xingyi Yang", + "Xinyi He", + "Ryan Rossi", + "Handong Zhao", + "Rose Yu" + ], + "emails": [ + "~Xinyi_He1", + "~Xingyi_Yang1", + "~Zihao_Zhou1", + "~Rose_Yu1", + "~Handong_Zhao3", + "~Ryan_Rossi1" + ], + "rank": 1645 + }, + { + "url": "https://openreview.net/forum?id=E3SWxn0cDBG", + "ratings": [ + 6, + 4, + 7, + 5 + ], + "rating": "5.18", + "confidences": [ + 2, + 4, + 2, + 3 + ], + "abstract": "Incorporating a so-called momentum dynamic in gradient descent methods is widely used in neural net training as it has been broadly observed that, at least empirically, it often leads to significantly faster convergence. At the same time, there are very few theoretical guarantees in the literature to explain this apparent acceleration effect. In this paper we show that Polyak's momentum, in combination with over-parameterization of the model, helps achieve faster convergence in training a one-layer ReLU network on n examples. We show specifically that gradient descent with Polyak's momentum decreases the initial training error at a rate much faster than that of vanilla gradient descent. We provide a bound for a fixed sample size n, and we show that gradient descent with Polyak's momentum converges at an accelerated rate to a small error that is controllable by the number of neurons m. Prior work (Du et al. 2019) showed that using vanilla gradient descent, and with a similar method of over-parameterization, the error decays as (1\u2212\u03ban)t after t iterations, where \u03ban is a problem-specific parameter. Our result shows that with the appropriate choice of momentum parameter one has a rate of (1\u2212\u03ban)t. 
This work establishes that momentum does indeed speed up neural net training.\n", + "title": "Provable Acceleration of Wide Neural Net Training via Polyak's Momentum", + "authors": [ + "Jun-Kun Wang", + "Jacob Abernethy" + ], + "emails": [ + "~Jun-Kun_Wang1", + "~Jacob_Abernethy1" + ], + "rank": 1646 + }, + { + "url": "https://openreview.net/forum?id=nG4Djb4h8Re", + "decision": "Reject", + "ratings": [ + 6, + 5, + 4 + ], + "rating": "5.18", + "confidences": [ + 5, + 3, + 3 + ], + "abstract": "There are large individual differences in physiological processes, making designing personalized health sensing algorithms challenging. Existing machine learning systems struggle to generalize well to unseen subjects or contexts, especially in video-based physiological measurement. Although fine-tuning for a user might address this issue, it is difficult to collect large sets of training data for specific individuals because supervised algorithms require medical-grade sensors for generating the training target. Therefore, learning personalized or customized models from a small number of unlabeled samples is very attractive as it would allow fast calibrations. In this paper, we present a novel meta-learning approach called MetaPhys for learning personalized cardiac signals from 18-seconds of video data. MetaPhys works in both supervised and unsupervised manners. We evaluate our proposed approach on two benchmark datasets and demonstrate superior performance in cross-dataset evaluation with substantial reductions (42% to 44%) in errors compared with state-of-the-art approaches. Visualization of attention maps and ablation experiments reveal how the model adapts to each subject and why our proposed approach leads to these improvements. We have also demonstrated our proposed method significantly helps reduce the bias in skin type. 
\n", + "title": "MetaPhys: Few-Shot Adaptation for Non-Contact Physiological Measurement", + "authors": [ + "Xin Liu", + "Ziheng Jiang", + "Joshua Wolff Fromm", + "Xuhai Xu", + "Shwetak Patel", + "Daniel McDuff" + ], + "emails": [ + "~Xuhai_Xu1", + "~Daniel_McDuff1", + "~Shwetak_Patel1", + "~Xin_Liu8", + "~Joshua_Wolff_Fromm1", + "~Ziheng_Jiang1" + ], + "rank": 1647 + }, + { + "url": "https://openreview.net/forum?id=qcKh_Msv1GP", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 5 + ], + "rating": "5.18", + "confidences": [ + 3, + 5, + 4, + 5 + ], + "abstract": "Graph motifs are significant subgraph patterns occurring frequently in graphs, and they play important roles in representing the whole graph characteristics. For example, in the chemical domain, functional groups are motifs that can determine molecule properties. Mining and utilizing motifs, however, is a non-trivial task for large graph datasets. Traditional motif discovery approaches mostly rely on exact counting or statistical estimation, which are hard to scale for a large number of graphs with continuous and high-dimension features. In light of the significance and challenges of motif mining, we propose : MICRO-Graph: a framework for \\underline{M}ot\\underline{I}f-driven \\underline{C}ontrastive lea\\underline{R}ning \\underline{O}f \\underline{G}raph representations to: 1) pre-train Graph Neural Networks (GNNs) in a self-supervised manner to automatically extract graph motifs from large graph datasets; 2) leverage learned motifs to guide the contrastive learning of graph representations, which further benefit various graph downstream tasks. Specifically, given a graph dataset, a motif learner cluster similar and significant subgraphs into corresponding motif slots. Based on the learned motifs, a motif-guided subgraph segmenter is trained to generate more informative subgraphs, which are used to conduct graph-to-subgraph contrastive learning of GNNs. 
Our discovering strategy is to simutaneously do clustering and contrastive learning on dynamically sampled subgraphs. The clustering part pull together similar subgraphs across different whole graphs, as the contrastive part push away dissimilar ones. Meanwhile, our learnable sampler will generate subgraph samples better aligned with the discoverying procedure. By pre-training on ogbn-molhiv molecule dataset with our proposed MICRO-Graph, the pre-trained GNN model can enhance various chemical property prediction downstream tasks with scarce label by 2.0%, and significantly higher than other state-of-the-art self-supervised learning baselines. \n", + "title": "Motif-Driven Contrastive Learning of Graph Representations", + "authors": [ + "Shichang Zhang", + "Ziniu Hu", + "Arjun Subramonian", + "Yizhou Sun" + ], + "emails": [ + "~Shichang_Zhang2", + "ucla.edu", + "~Ziniu_Hu1", + "~Yizhou_Sun1" + ], + "rank": 1648 + }, + { + "url": "https://openreview.net/forum?id=WUNF4WVPvMy", + "decision": "Reject", + "ratings": [ + 5, + 5, + 7, + 4, + 6 + ], + "rating": "5.18", + "confidences": [ + 2, + 4, + 1, + 2, + 2 + ], + "abstract": " We further research on the acceleration phenomenon on Riemannian manifolds by introducing the first global first-order method that achieves the same rates as accelerated gradient descent in the Euclidean space for the optimization of smooth and geodesically convex (g-convex) or strongly g-convex functions defined on the hyperbolic space or a subset of the sphere, up to constants and log factors. 
To the best of our knowledge, this is the first method that is proved to achieve these rates globally on functions defined on a Riemannian manifold M other than the Euclidean space.\n Additionally, for any Riemannian manifold of bounded sectional curvature, we provide reductions from optimization methods for smooth and g-convex functions to methods for smooth and strongly g-convex functions and vice versa.", + "title": "Acceleration in Hyperbolic and Spherical Spaces", + "authors": [ + "David Mart\u00ednez-Rubio" + ], + "emails": [ + "~David_Mart\u00ednez-Rubio2" + ], + "rank": 1649 + }, + { + "url": "https://openreview.net/forum?id=JCRblSgs34Z", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 3, + 5, + 8 + ], + "rating": "5.18", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "In deep neural networks, the spectral norm of the Jacobian of a layer bounds the factor by which the norm of a signal changes during forward/backward propagation. Spectral norm regularizations have been shown to improve generalization, robustness and optimization of deep learning methods. Existing methods to compute the spectral norm of convolution layers either rely on heuristics that are efficient in computation but lack guarantees or are theoretically-sound but computationally expensive. In this work, we obtain the best of both worlds by deriving {\\it four} provable upper bounds on the spectral norm of a standard 2D multi-channel convolution layer. These bounds are differentiable and can be computed efficiently during training with negligible overhead. One of these bounds is in fact the popular heuristic method of Miyato et al. (multiplied by a constant factor depending on filter sizes). Each of these four bounds can achieve the tightest gap depending on convolution filters. Thus, we propose to use the minimum of these four bounds as a tight, differentiable and efficient upper bound on the spectral norm of convolution layers. 
Moreover, our spectral bound is an effective regularizer and can be used to bound either the lipschitz constant or curvature values (eigenvalues of the Hessian) of neural networks. Through experiments on MNIST and CIFAR-10, we demonstrate the effectiveness of our spectral bound in improving generalization and robustness of deep networks.", + "title": "Fantastic Four: Differentiable and Efficient Bounds on Singular Values of Convolution Layers", + "authors": [ + "Sahil Singla", + "Soheil Feizi" + ], + "emails": [ + "~Soheil_Feizi2", + "~Sahil_Singla1" + ], + "rank": 1650 + }, + { + "url": "https://openreview.net/forum?id=wta_8Hx2KD", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 6, + 4, + 7 + ], + "rating": "5.18", + "confidences": [ + 2, + 2, + 4, + 3 + ], + "abstract": "Recent work has shown deep learning can accelerate the prediction of physical dynamics relative to numerical solvers. However, limited physical accuracy and an inability to generalize under distributional shift limit its applicability to the real world. We propose to improve accuracy and generalization by incorporating symmetries into convolutional neural networks. Specifically, we employ a variety of methods each tailored to enforce a different symmetry. Our models are both theoretically and experimentally robust to distributional shift by symmetry group transformations and enjoy favorable sample complexity. We demonstrate the advantage of our approach on a variety of physical dynamics including Rayleigh\u2013B\u00e9nard convection and real-world ocean currents and temperatures. 
Compare with image or text applications, our work is a significant step towards applying equivariant neural networks to high-dimensional systems with complex dynamics.", + "title": "Incorporating Symmetry into Deep Dynamics Models for Improved Generalization", + "authors": [ + "Rui Wang", + "Robin Walters", + "Rose Yu" + ], + "emails": [ + "~Rose_Yu1", + "~Rui_Wang11", + "~Robin_Walters1" + ], + "rank": 1651 + }, + { + "url": "https://openreview.net/forum?id=2ioNazs6lvw", + "decision": "Reject", + "ratings": [ + 6, + 7, + 3 + ], + "rating": "5.18", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "Optimal transport is a notoriously difficult problem to solve numerically, with current approaches often remaining intractable for very large scale applications such as those encountered in machine learning. Wasserstein barycenters -- the problem of finding measures in-between given input measures in the optimal transport sense -- is even more computationally demanding. \nBy training a deep convolutional neural network, we improve by a factor of 60 the computational speed of Wasserstein barycenters over the fastest state-of-the-art approach on the GPU, resulting in milliseconds computational times on 512\u00d7512 regular grids.\nWe show that our network, trained on Wasserstein barycenters of pairs of measures, generalizes well to the problem of finding Wasserstein barycenters of more than two measures. 
We validate our approach on synthetic shapes generated via Constructive Solid Geometry as well as on the ``Quick, Draw'' sketches dataset.", + "title": "Learning to generate Wasserstein barycenters", + "authors": [ + "Julien Lacombe", + "Julie Digne", + "Nicolas Courty", + "Nicolas Bonneel" + ], + "emails": [ + "~Nicolas_Courty1", + "~Julie_Digne1", + "~Nicolas_Bonneel1", + "~Julien_Lacombe1" + ], + "rank": 1652 + }, + { + "url": "https://openreview.net/forum?id=MP0LhG4YiiC", + "decision": "Reject", + "ratings": [ + 7, + 5, + 3 + ], + "rating": "5.18", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "Children acquire language subconsciously by observing the surrounding world and listening to descriptions. They can discover the meaning of words even without explicit language knowledge, and generalize to novel compositions effortlessly. In this paper, we bring this ability to AI, by studying the task of multimodal compositional generalization within the context of visually grounded language acquisition. We propose a multimodal transformer model augmented with a novel mechanism for analogical reasoning, which approximates novel compositions by learning semantic mapping and reasoning operations from previously seen compositions. Our proposed method, Analogical Reasoning Transformer Networks (ARTNet), is trained on raw multimedia data (video frames and transcripts), and after observing a set of compositions such as \"washing apple\" or \"cutting carrot\", it can generalize and recognize new compositions in new video frames, such as \"washing carrot\" or \"cutting apple\". To this end, ARTNet refers to relevant instances in the training data and uses their visual features and captions to establish analogies with the query image. Then it chooses a suitable verb and noun to create a new composition that describes the new image best. 
Extensive experiments on an instructional video dataset demonstrate that the proposed method achieves significantly better generalization capability and recognition accuracy compared to state-of-the-art transformer models.", + "title": "Analogical Reasoning for Visually Grounded Compositional Generalization", + "authors": [ + "Bo Wu", + "Haoyu Qin", + "Alireza Zareian", + "Carl Vondrick", + "Shih-Fu Chang" + ], + "emails": [ + "~Haoyu_Qin1", + "~Shih-Fu_Chang3", + "~Alireza_Zareian2", + "~Bo_Wu6", + "~Carl_Vondrick2" + ], + "rank": 1653 + }, + { + "url": "https://openreview.net/forum?id=mPmCP2CXc7p", + "decision": "Reject", + "ratings": [ + 9, + 4, + 3, + 4 + ], + "rating": "5.18", + "confidences": [ + 5, + 4, + 5, + 3 + ], + "abstract": "In many machine learning tasks, input features with varying degrees of predictive capability are usually acquired at some cost. For example, in human activity recognition (HAR) and mobile health (mHealth) applications, monitoring performance should be achieved with a low cost to gather different sensory features, as maintaining sensors incur monetary, computation, and energy cost. We propose an adaptive feature selection method that dynamically selects features for prediction at any given time point. We formulate this problem as an \u21130 minimization problem across time, and cast the combinatorial optimization problem into a stochastic optimization formulation. We then utilize a differentiable relaxation to make the problem amenable to gradient-based optimization. Our evaluations on four activity recognition datasets show that our method achieves a favorable trade-off between performance and the number of features used. 
Moreover, the dynamically selected features of our approach are shown to be interpretable and associated with the actual activity types.", + "title": "Dynamic Feature Selection for Efficient and Interpretable Human Activity Recognition", + "authors": [ + "Randy Ardywibowo", + "Shahin Boluki", + "Zhangyang Wang", + "Bobak J Mortazavi", + "Shuai Huang", + "Xiaoning Qian" + ], + "emails": [ + "~Randy_Ardywibowo1", + "~Shuai_Huang1", + "~Zhangyang_Wang1", + "~Shahin_Boluki1", + "~Xiaoning_Qian2", + "~Bobak_J_Mortazavi1" + ], + "rank": 1654 + }, + { + "url": "https://openreview.net/forum?id=HN77M0Sdnp2", + "ratings": [ + 4, + 7, + 4, + 6 + ], + "rating": "5.18", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "It is commonly believed that networks cannot be both accurate and robust, that gaining robustness means losing accuracy. It is also generally believed that, unless making networks larger, network architectural elements would otherwise matter little in improving adversarial robustness. Here we present evidence to challenge these common beliefs by a careful study about adversarial training. Our key observation is that the widely-used ReLU activation function significantly weakens adversarial training due to its non-smooth nature. Hence we propose smooth adversarial training (SAT), in which we replace ReLU with its smooth approximations to strengthen adversarial training. The purpose of smooth activation functions in SAT is to allow it to find harder adversarial examples and compute better gradient updates during adversarial training. Compared to standard adversarial training, SAT improves adversarial robustness for \"free\", i.e., no drop in accuracy and no increase in computational cost. For example, without introducing additional computations, SAT significantly enhances ResNet-50's robustness from 33.0% to 42.3%, while also improving accuracy by 0.9% on ImageNet. 
SAT also works well with larger networks: it helps EfficientNet-L1 to achieve 82.2% accuracy and 58.6% robustness on ImageNet, outperforming the previous state-of-the-art defense by 9.5% for accuracy and 11.6% for robustness.", + "title": "Smooth Adversarial Training", + "authors": [ + "cihang xie", + "Mingxing Tan", + "Boqing Gong", + "Alan Yuille", + "Quoc V Le" + ], + "emails": [ + "~Quoc_V_Le1", + "~Mingxing_Tan3", + "~cihang_xie1", + "~Boqing_Gong1", + "~Alan_Yuille1" + ], + "rank": 1655 + }, + { + "url": "https://openreview.net/forum?id=T6RYeudzf1", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 5 + ], + "rating": "5.17", + "confidences": [ + 4, + 2, + 4, + 2 + ], + "abstract": "We present a novel approach to the challenging problem of label-free text style transfer. Unlike previous approaches that use parallel or non-parallel labeled data, our technique removes the need for labels entirely, relying instead on the implicit connection in style between adjacent sentences in unlabeled text. We show that T5 (Raffel et al., 2020), a strong pretrained text-to-text model, can be adapted to extract a style vector from arbitrary text and use this vector to condition the decoder to perform style transfer. As the resulting learned style vector space encodes many facets of textual style, we recast transfers as \"targeted restyling\" vector operations that adjust specific attributes of the input text while preserving others. When trained over unlabeled Amazon reviews data, our resulting TextSETTR model is competitive on sentiment transfer, even when given only four exemplars of each class. Furthermore, we demonstrate that a single model trained on unlabeled Common Crawl data is capable of transferring along multiple dimensions including dialect, emotiveness, formality, politeness, and sentiment. 
", + "title": "TextSETTR: Label-Free Text Style Extraction and Tunable Targeted Restyling", + "authors": [ + "Parker Riley", + "Noah Constant", + "Mandy Guo", + "Girish Kumar", + "David Uthus", + "Zarana Parekh" + ], + "emails": [ + "~Parker_Riley1", + "~Noah_Constant1", + "~Girish_Kumar1", + "~Zarana_Parekh2", + "~David_Uthus1", + "~Mandy_Guo2" + ], + "rank": 1656 + }, + { + "url": "https://openreview.net/forum?id=Lnomatc-1s", + "decision": "Reject", + "ratings": [ + 6, + 6, + 4 + ], + "rating": "5.17", + "confidences": [ + 3, + 4, + 5 + ], + "abstract": "We study learning-based sketching for Hessians, which is known to provide considerable speedups to second order optimization. A number of works have shown how to sketch or subsample the Hessian to speed up each iteration, but such sketches are usually specific to the matrix at hand, rather than being learned from a distribution. We extend such schemes to learned sketches, where we learn different potentially different sketches for the different iterations, and show empirically that learned sketches, compared with their \"non-learned\" counterparts, improve the approximation accuracy for a large number of important problems, including LASSO, SVM, and matrix estimation with nuclear norm constraints. ", + "title": "Learning-Augmented Sketches for Hessians", + "authors": [ + "Yi Li", + "Honghao Lin", + "David Woodruff" + ], + "emails": [ + "~Yi_Li8", + "~David_Woodruff2", + "~Honghao_Lin1" + ], + "rank": 1657 + }, + { + "url": "https://openreview.net/forum?id=CYHMIhbuLFl", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 6 + ], + "rating": "5.17", + "confidences": [ + 4, + 3, + 3, + 2 + ], + "abstract": "While deep learning has obtained state-of-the-art results in many applications, the adaptation of neural network architectures to incorporate new features remains a research challenge. 
This issue is particularly severe in online learning settings, where new features are added continually with few or no associated observations. As such, methods for adapting neural networks to novel features which are both time and data-efficient are desired. To address this, we propose the Contextual HyperNetwork (CHN), which predicts the network weights associated with new features by incorporating information from both existing data as well as the few observations for the new feature and any associated feature metadata. At prediction time, the CHN requires only a single forward pass through a small neural network, yielding a significant speed-up when compared to re-training and fine-tuning approaches. In order to showcase the performance of CHNs, in this work we use a CHN to augment a partial variational autoencoder (P-VAE), a flexible deep generative model which can impute the values of missing features in sparsely-observed data. We show that this system obtains significantly improved performance for novel feature adaptation over existing imputation and meta-learning baselines across recommender systems, e-learning, and healthcare tasks.", + "title": "Contextual HyperNetworks for Novel Feature Adaptation", + "authors": [ + "Angus Lamb", + "Evgeny Saveliev", + "Yingzhen Li", + "Sebastian Tschiatschek", + "Camilla Longden", + "Simon Woodhead", + "Jos\u00e9 Miguel Hern\u00e1ndez-Lobato", + "Richard E Turner", + "Pashmina Cameron", + "Cheng Zhang" + ], + "emails": [ + "~Cheng_Zhang1", + "~Sebastian_Tschiatschek1", + "microsoft.com", + "eedi.co.uk", + "~Richard_E_Turner1", + "~Angus_Lamb1", + "~Jos\u00e9_Miguel_Hern\u00e1ndez-Lobato1", + "gmail.com", + "~Pashmina_Cameron1", + "~Yingzhen_Li1" + ], + "rank": 1658 + }, + { + "url": "https://openreview.net/forum?id=88_MfcJoJlS", + "decision": "Reject", + "ratings": [ + 6, + 4, + 6 + ], + "rating": "5.17", + "confidences": [ + 4, + 5, + 3 + ], + "abstract": " Solving sparse reward tasks through exploration is one of the 
major challenges in deep reinforcement learning, especially in three-dimensional, partially-observable environments. Critically, the algorithm proposed in this article uses a single human demonstration to solve hard-exploration problems. We train an agent on a combination of demonstrations and own experience to solve problems with variable initial conditions. We adapt this idea and integrate it with the proximal policy optimization (PPO). The agent is able to increase its performance and to tackle harder problems by replaying its own past trajectories prioritizing them based on the obtained reward and the maximum value of the trajectory.\nWe compare variations of this algorithm to different imitation learning algorithms on a set of hard-exploration tasks in the Animal-AI Olympics environment.\nTo the best of our knowledge, learning a task in a three-dimensional environment with comparable difficulty has never been considered before using only one human demonstration.", + "title": "Guided Exploration with Proximal Policy Optimization using a Single Demonstration", + "authors": [ + "Gabriele Libardi", + "Gianni De Fabritiis" + ], + "emails": [ + "~Gabriele_Libardi1", + "~Gianni_De_Fabritiis1" + ], + "rank": 1659 + }, + { + "url": "https://openreview.net/forum?id=lU3Te8xLYR", + "ratings": [ + 3, + 6, + 6, + 6 + ], + "rating": "5.17", + "confidences": [ + 5, + 5, + 3, + 5 + ], + "abstract": "Deep learning is currently one prominent approach for image denoising, and most of existing works train a denoising neural network (NN) on many pairs of noisy images and their clean counterparts. Recent studies showed that it is possible to train a denoising NN on a dataset consisting of only noisy images. This paper took one step further to study how to train a powerful denoising NN for a given image without any training samples, which is appealing to the applications where collecting training samples is challenging. 
For instance, biological imaging and medical imaging.\nBuilt on the Bayesian neural network (BNN), this paper proposed a self-supervised deep learning method for denoising a single image, in the absence of training samples. The experiments showed that the performance of our self-supervised method is very competitive to those state-of-the-art supervised ones.", + "title": "Self-supervised Bayesian Deep Learning for Image Denoising", + "authors": [ + "Tongyao Pang", + "Yuhui Quan", + "Hui Ji" + ], + "emails": [ + "~Tongyao_Pang1", + "~Hui_Ji1", + "~Yuhui_Quan5" + ], + "rank": 1660 + }, + { + "url": "https://openreview.net/forum?id=dN_iVr6iNuU", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 4 + ], + "rating": "5.17", + "confidences": [ + 3, + 4, + 4, + 1 + ], + "abstract": "The first deep RL algorithm, DQN, was limited by the overestimation bias of the learned Q-function. Subsequent algorithms proposed techniques to reduce this problem, without fully eliminating it. Recently, the Maxmin and Ensemble Q-learning algorithms used the different estimates provided by ensembles of learners to reduce the bias. Unfortunately, these learners can converge to the same point in the parametric or representation space, falling back to the classic single neural network DQN. In this paper, we describe a regularization technique to maximize diversity in the representation space in these algorithms. We propose and compare five regularization functions inspired from economics theory and consensus optimization. 
We show that the resulting approach significantly outperforms the Maxmin and Ensemble Q-learning algorithms as well as non-ensemble baselines.", + "title": "Preventing Value Function Collapse in Ensemble Q-Learning by Maximizing Representation Diversity", + "authors": [ + "Hassam Sheikh", + "Ladislau Boloni" + ], + "emails": [ + "~Ladislau_Boloni1", + "~Hassam_Sheikh1" + ], + "rank": 1661 + }, + { + "url": "https://openreview.net/forum?id=PsdsEbzxZWr", + "decision": "Reject", + "ratings": [ + 7, + 4, + 5 + ], + "rating": "5.17", + "confidences": [ + 3, + 4, + 5 + ], + "abstract": "Generative adversarial training (GAT) is a recently introduced adversarial defense method. Previous works have focused on empirical evaluations of its application to training robust predictive models. In this paper we focus on theoretical understanding of the GAT method and extending its application to generative modeling and out-of-distribution detection. We analyze the optimal solutions of the maximin formulation employed by the GAT objective, and make a comparative analysis of the minimax formulation employed by GANs. We use theoretical analysis and 2D simulations to understand the convergence property of the training algorithm. Based on these results, we develop an unconstrained GAT algorithm, and conduct comprehensive evaluations of the algorithm's application to image generation and adversarial out-of-distribution detection. 
Our results suggest that generative adversarial training is a promising new direction for the above applications.", + "title": "Analyzing and Improving Generative Adversarial Training for Generative Modeling and Out-of-Distribution Detection", + "authors": [ + "Xuwang Yin", + "Shiying li", + "Gustavo Rohde" + ], + "emails": [ + "~Gustavo_Rohde1", + "virginia.edu", + "~Xuwang_Yin2" + ], + "rank": 1662 + }, + { + "url": "https://openreview.net/forum?id=Dw8vAUKYq8C", + "decision": "Reject", + "ratings": [ + 7, + 6, + 5, + 4 + ], + "rating": "5.17", + "confidences": [ + 2, + 2, + 4, + 4 + ], + "abstract": "Hard visual attention is a promising approach to reduce the computational burden of modern computer vision methodologies. Hard attention mechanisms are typically non-differentiable. They can be trained with reinforcement learning but the high-variance training this entails hinders more widespread application. We show how hard attention for image classification can be framed as a Bayesian optimal experimental design (BOED) problem. From this perspective, the optimal locations to attend to are those which provide the greatest expected reduction in the entropy of the classification distribution. We introduce methodology from the BOED literature to approximate this optimal behaviour, and use it to generate `near-optimal' sequences of attention locations. We then show how to use such sequences to partially supervise, and therefore speed up, the training of a hard attention mechanism. 
Although generating these sequences is computationally expensive, they can be reused by any other networks later trained on the same task.", + "title": "Near-Optimal Glimpse Sequences for Training Hard Attention Neural Networks", + "authors": [ + "William Harvey", + "Michael Teng", + "Frank Wood" + ], + "emails": [ + "~Frank_Wood2", + "~Michael_Teng1", + "~William_Harvey1" + ], + "rank": 1663 + }, + { + "url": "https://openreview.net/forum?id=moKjka2UOC", + "ratings": [ + 5, + 5, + 6, + 6, + 4 + ], + "rating": "5.17", + "confidences": [ + 5, + 4, + 5, + 4, + 5 + ], + "abstract": "Domain adaptation enhances generalizability of a model across domains with domain shifts. Most research effort has been spent on Unsupervised Domain Adaption (UDA) which trains a model jointly with labeled source data and unlabeled target data. This paper studies how much it can help address domain shifts if we further have a few target samples (e.g., one sample per class) labeled. This is the so-called semi-supervised domain adaptation (SSDA) problem and the few labeled target samples are termed as ``landmarks''. To explore the full potential of landmarks, we incorporate a prototypical alignment (PA) module which calculates a target prototype for each class from the landmarks; source samples are then aligned with the target prototype from the same class. To further alleviate label scarcity, we propose a data augmentation based solution. Specifically, we severely perturb the labeled images, making PA non-trivial to achieve and thus promoting model generalizability. Moreover, we apply consistency learning on unlabeled target images, by perturbing each image with light transformations and strong transformations. Then, the strongly perturbed image can enjoy ``supervised-like'' training using the pseudo label inferred from the lightly perturbed one. 
Experiments show that the proposed method, though simple, reaches significant performance gains over state-of-the-art methods, and enjoys the flexibility of being able to serve as a plug-and-play component to various existing UDA methods and improve adaptation performance with landmarks provided. ", + "title": "Semi-supervised Domain Adaptation with Prototypical Alignment and Consistency Learning", + "authors": [ + "Kai Li", + "Chang Liu", + "Handong Zhao", + "Yulun Zhang", + "Yun Fu" + ], + "emails": [ + "~Chang_Liu13", + "~Yulun_Zhang1", + "~Handong_Zhao3", + "~Yun_Fu1", + "~Kai_Li3" + ], + "rank": 1664 + }, + { + "url": "https://openreview.net/forum?id=sr68jSUakP", + "decision": "Reject", + "ratings": [ + 4, + 7, + 5 + ], + "rating": "5.17", + "confidences": [ + 4, + 3, + 5 + ], + "abstract": "Face clustering is an important task, due to its wide applications in practice. Graph-based face clustering methods have recently made a great progress and achieved new state-of-the-art results. Learning discriminative node features is the key to further improve the performance of graph-based face clustering. To this end, we propose subspace learning as a new way to learn discriminative node features, which is implemented by a new orthogonal subspace decomposition (OSD) module. In graph-based face clustering, OSD leads to more discriminative node features, which better reflect the relationship between each pair of faces, thereby boosting the accuracy of face clustering. 
Extensive experiments show that OSD outperforms state-of-the-art results with a healthy margin.", + "title": "Orthogonal Subspace Decomposition: A New Perspective of Learning Discriminative Features for Face Clustering", + "authors": [ + "Jianfeng Wang", + "Thomas Lukasiewicz", + "zhongchao shi" + ], + "emails": [ + "~zhongchao_shi1", + "~Thomas_Lukasiewicz2", + "~Jianfeng_Wang2" + ], + "rank": 1665 + }, + { + "url": "https://openreview.net/forum?id=heFdS9_tkzc", + "decision": "Reject", + "ratings": [ + 6, + 4, + 6, + 5, + 5 + ], + "rating": "5.16", + "confidences": [ + 3, + 4, + 4, + 4, + 4 + ], + "abstract": "Distant supervision is widely used in relation extraction in order to create a large-scale training dataset by aligning a knowledge base with unstructured text. Most existing studies in this field have assumed there is a great deal of centralized unstructured text. However, in practice, text may be distributed on different platforms and cannot be centralized due to privacy restrictions. Therefore, it is worthwhile to investigate distant supervision in the federated learning paradigm, which decouples the training of the model from the need for direct access to the raw text. However, overcoming label noise of distant supervision becomes more difficult in federated settings, because the sentences containing the same entity pair scatter around different platforms. In this paper, we propose a federated denoising framework to suppress label noise in federated settings. The core of this framework is a multiple instance learning based denoising method that is able to select reliable sentences via cross-platform collaboration. 
Various experimental results on New York Times dataset and miRNA gene regulation relation dataset demonstrate the effectiveness of the proposed method.", + "title": "Distantly Supervised Relation Extraction in Federated Settings", + "authors": [ + "Dianbo Sui", + "Yubo Chen", + "Kang Liu", + "Jun Zhao" + ], + "emails": [ + "~Kang_Liu1", + "~Yubo_Chen1", + "~Jun_Zhao4", + "~Dianbo_Sui1" + ], + "rank": 1666 + }, + { + "url": "https://openreview.net/forum?id=cYr2OPNyTz7", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6, + 4, + 5 + ], + "rating": "5.16", + "confidences": [ + 4, + 3, + 4, + 5, + 3 + ], + "abstract": "Masked Language Model (MLM) framework has been widely adopted for self-supervised language pre-training. In this paper, we argue that randomly sampled masks in MLM would lead to undesirably large gradient variance. Thus, we theoretically quantify the gradient variance via correlating the gradient covariance with the Hamming distance between two different masks (given a certain text sequence). To reduce the variance due to the sampling of masks, we propose a fully-explored masking strategy, where a text sequence is divided into a certain number of non-overlapping segments. Thereafter, the tokens within one segment are masked for training. We prove, from a theoretical perspective, that the gradients derived from this new masking schema have a smaller variance and can lead to more efficient self-supervised training. We conduct extensive experiments on both continual pre-training and general pre-training from scratch. Empirical results confirm that this new masking strategy can consistently outperform standard random masking. 
Detailed efficiency analysis and ablation studies further validate the advantages of our fully-explored masking strategy under the MLM framework.", + "title": "Improving Self-supervised Pre-training via a Fully-Explored Masked Language Model", + "authors": [ + "Mingzhi Zheng", + "Dinghan Shen", + "yelong shen", + "Weizhu Chen", + "Lin Xiao" + ], + "emails": [ + "~Lin_Xiao1", + "~yelong_shen1", + "~Mingzhi_Zheng1", + "~Weizhu_Chen1", + "~Dinghan_Shen1" + ], + "rank": 1667 + }, + { + "url": "https://openreview.net/forum?id=eBHq5irt-tk", + "decision": "Reject", + "ratings": [ + 5, + 4, + 6, + 6 + ], + "rating": "5.15", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "Neural networks appear to have mysterious generalization properties when using parameter counting as a proxy for complexity. Indeed, neural networks often have many more parameters than there are data points, yet still provide good generalization performance. Moreover, when we measure generalization as a function of parameters, we see double descent behaviour, where the test error decreases, increases, and then again decreases. We show that many of these properties become understandable when viewed through the lens of effective dimensionality, which measures the dimensionality of the parameter space determined by the data. We relate effective dimensionality to posterior contraction in Bayesian deep learning, model selection, width-depth tradeoffs, double descent, and functional diversity in loss surfaces, leading to a richer understanding of the interplay between parameters and functions in deep models. 
We also show that effective dimensionality compares favourably to alternative norm- and flatness- based generalization measures.", + "title": "Rethinking Parameter Counting: Effective Dimensionality Revisited", + "authors": [ + "Gregory Benton", + "Wesley Maddox", + "Andrew Gordon Wilson" + ], + "emails": [ + "~Wesley_Maddox1", + "~Andrew_Gordon_Wilson1", + "~Gregory_Benton1" + ], + "rank": 1668 + }, + { + "url": "https://openreview.net/forum?id=TJSOfuZEd1B", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4, + 5, + 6 + ], + "rating": "5.15", + "confidences": [ + 3, + 4, + 5, + 4, + 4 + ], + "abstract": "While large-scale language models (LMs) are able to imitate the distribution of natural language well enough to generate realistic text, it is difficult to control which regions of the distribution they generate. This is especially problematic because datasets used for training large LMs usually contain significant toxicity, hate, bias, and negativity. We propose GeDi as an efficient method for using smaller LMs as generative discriminators to guide generation from large LMs to make them safer and more controllable. GeDi guides generation at each step by computing classification probabilities for all possible next tokens via Bayes rule by normalizing over two class-conditional distributions; one conditioned on the desired attribute, or control code, and another conditioned on the undesired attribute, or anti control code. We find that GeDi gives controllability on par with or better than the state of the art method in a variety of settings, while also achieving generation speeds more than 30 times faster. Additionally, training GeDi on only three topics allows us to controllably generate new topics zero-shot from just a keyword. 
Lastly, we show that GeDi can make GPT-2 and GPT-3 significantly less toxic without sacrificing on linguistic fluency, making it by far the most practical existing method for detoxifying large language models while maintaining a fast generation speed.", + "title": "GeDi: Generative Discriminator Guided Sequence Generation", + "authors": [ + "Ben Krause", + "Akhilesh Deepak Gotmare", + "Bryan McCann", + "Nitish Shirish Keskar", + "Shafiq Joty", + "richard socher", + "Nazneen Rajani" + ], + "emails": [ + "~Ben_Krause1", + "~Nazneen_Rajani1", + "~Bryan_McCann1", + "~Shafiq_Joty1", + "~Akhilesh_Deepak_Gotmare1", + "~richard_socher1", + "~Nitish_Shirish_Keskar1" + ], + "rank": 1669 + }, + { + "url": "https://openreview.net/forum?id=gQn5xeVtz0I", + "ratings": [ + 7, + 6, + 4, + 4 + ], + "rating": "5.15", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Recent work (Zhu, Liu, and Han 2019) has shown that it is possible to reconstruct the input (image) from the gradient of a neural network. In this paper, our aim is to better understand the limits to reconstruction and to speed up image reconstruction by imposing prior image information and improved initialization. Exploring the theoretical limits of input reconstruction, we show that a fully-connected neural network with a single hidden node is enough to reconstruct a single input image, regardless of the number of nodes in the output layer. Then we generalize this result to a gradient averaged over mini-batches of size B. In this case, the full mini-batch can be reconstructed in a fully-connected network if the number of hidden units exceeds B. For a convolutional neural network, the required number of filters in the first convolutional layer again is decided by the batch size B, however, in this case, input width d and the width after filter d\u2032 also play the role h=(dd\u2032)2BC, where C is channel number of input. 
Finally, we validate and underpin our theoretical analysis on bio-medical data (fMRI, ECG signals, and cell images) and on benchmark data (MNIST, CIFAR100, and face images).", + "title": "What can we learn from gradients?", + "authors": [ + "Jia Qian", + "Lars Kai Hansen" + ], + "emails": [ + "~Jia_Qian1", + "~Lars_Kai_Hansen1" + ], + "rank": 1670 + }, + { + "url": "https://openreview.net/forum?id=MwxaStJXK6v", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4, + 6 + ], + "rating": "5.15", + "confidences": [ + 3, + 3, + 4, + 3 + ], + "abstract": "Double Q-learning \\citep{hasselt2010double} has gained significant success in practice due to its effectiveness in overcoming the overestimation issue of Q-learning. However, theoretical understanding of double Q-learning is rather limited and the only existing finite-time analysis was recently established in \\citet{xiong2020double} under a polynomial learning rate. This paper analyzes the more challenging case with a rescaled linear/constant learning rate for which the previous method does not appear to be applicable. We develop new analytical tools that achieve an order-level better finite-time convergence rate than the previously established result. Specifically, we show that synchronous double Q-learning attains an \u03f5-accurate global optimum with a time complexity of \u03a9(ln\u2061D(1\u2212\u03b3)7\u03f52), and the asynchronous algorithm attains a time complexity of \u03a9~(L(1\u2212\u03b3)7\u03f52), where D is the cardinality of the state-action space, \u03b3 is the discount factor, and L is a parameter related to the sampling strategy for asynchronous double Q-learning. These results improve the order-level dependence of the convergence rate on all major parameters (\u03f5,1\u2212\u03b3,D,L) provided in \\citet{xiong2020double}. 
The new analysis in this paper presents a more direct and succinct approach for characterizing the finite-time convergence rate of double Q-learning.", + "title": "Double Q-learning: New Analysis and Sharper Finite-time Bound", + "authors": [ + "Lin Zhao", + "Huaqing Xiong", + "Yingbin Liang", + "Wei Zhang" + ], + "emails": [ + "~Yingbin_Liang1", + "~Wei_Zhang40", + "~Huaqing_Xiong1", + "~Lin_Zhao3" + ], + "rank": 1671 + }, + { + "url": "https://openreview.net/forum?id=LhAqAxwH5cn", + "decision": "Reject", + "ratings": [ + 7, + 7, + 5, + 3 + ], + "rating": "5.15", + "confidences": [ + 2, + 3, + 4, + 4 + ], + "abstract": "In ordinary-label learning, the correct label is given to each training sample. Similarly, a complementary label is also provided for each training sample in complementary-label learning. A complementary label indicates a class that the example does not belong to. Robust learning of classifiers has been investigated from many viewpoints under label noise, but little attention has been paid to complementary-label learning. In this paper, we present a new algorithm of complementary-label learning with the robustness of loss function. We also provide two sufficient conditions on a loss function so that the minimizer of the risk for complementary labels is theoretically guaranteed to be consistent with the minimizer of the risk for ordinary labels. Finally, the empirical results validate our method\u2019s superiority to current state-of-the-art techniques. 
Especially in cifar10, our algorithm achieves a much higher test accuracy than the gradient ascent algorithm, and the parameters of our model are less than half of the ResNet-34 they used.", + "title": "Robust Loss Functions for Complementary Labels Learning", + "authors": [ + "Defu Liu", + "Guowu Yang" + ], + "emails": [ + "~Defu_Liu1", + "uestc.edu.cn" + ], + "rank": 1672 + }, + { + "url": "https://openreview.net/forum?id=xrLrpG3Ep1X", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 5 + ], + "rating": "5.15", + "confidences": [ + 4, + 3, + 2, + 4 + ], + "abstract": "Domain generalization is an approach that utilizes several source domains to train the learner to be generalizable to unseen target domain to tackle domain shift issue. It has drawn much attention in machine learning community. This paper aims to learn to generalize well to unseen target domain without relying on the knowledge of the number of source domains and domain labels. To achieve that goal, we unify adversarial training and meta-learning in a novel proposed Domain-Free Adversarial Splitting (DFAS) framework. In this framework, we model the domain generalization as a learning problem that enforces the learner to be able to generalize well for any train/val subsets splitting of the training dataset. This model can be further transformed to be a min-max optimization problem which can be solved by an iterative adversarial training process. In each iteration, it adversarially splits the training dataset into train/val subsets to maximize domain shift between them using current learner, and then updates the learner on this splitting to be able to generalize well from train-subset to val-subset using meta-learning approach. Extensive experiments on three benchmark datasets under three different settings on the source and target domains show that our method achieves state-of-the-art results and confirm the effectiveness of our method by ablation study. 
We also derive a generalization error bound for theoretical understanding of our method.", + "title": "Domain-Free Adversarial Splitting for Domain Generalization", + "authors": [ + "Xiang Gu", + "Jiasun Feng", + "Jian Sun", + "Zongben Xu" + ], + "emails": [ + "stu.xjtu.edu.cn", + "~Zongben_Xu1", + "~Xiang_Gu1", + "~Jian_Sun1" + ], + "rank": 1673 + }, + { + "url": "https://openreview.net/forum?id=9vCLOXwprc", + "decision": "Reject", + "ratings": [ + 6, + 6, + 4, + 5 + ], + "rating": "5.15", + "confidences": [ + 2, + 4, + 4, + 3 + ], + "abstract": "We present Iterated Graph Neural Network System (IGNNS), a new framework of Graph Neural Networks (GNNs), which can deal with undirected graph and directed graph in a unified way. The core component of IGNNS is the Iterated Function System (IFS), which is an important research field in fractal geometry. The key idea of IGNNS is to use a pair of affine transformations to characterize the process of message passing between graph nodes and assign an adjoint probability vector to them to form an IFS layer with probability. After embedding in the latent space, the node features are sent to IFS layer for iterating, and then obtain the high-level representation of graph nodes. We also analyze the geometric properties of IGNNS from the perspective of dynamical system. We prove that if the IFS induced by IGNNS is contractive, then the fractal representation of graph nodes converges to the fractal set of IFS in Hausdorff distance and the ergodic representation of that converges to a constant matrix in Frobenius norm. We have carried out a series of semi supervised node classification experiments on citation network datasets such as citeser, Cora and PubMed. 
The experimental results show that the performance of our method is obviously better than the related methods.", + "title": "Iterated graph neural network system", + "authors": [ + "Hanju Li" + ], + "emails": [ + "~Hanju_Li1" + ], + "rank": 1674 + }, + { + "url": "https://openreview.net/forum?id=v5WXtSXsVCJ", + "decision": "Reject", + "ratings": [ + 7, + 4, + 5, + 5 + ], + "rating": "5.15", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "Word embeddings have gained increasing popularity in the recent years due to the Word2vec library and its extension fastText that uses subword information. In this paper, we aim at improving the execution speed of fastText training on homogeneous multi- and manycore CPUs while maintaining accuracy. We present a novel open-source implementation that flexibly incorporates various algorithmic variants including negative sample sharing, batched updates, and a byte-pair encoding-based alternative for subword units. We build these novel variants over a fastText implementation that we carefully optimized for the architecture, memory hierarchy, and parallelism of current manycore CPUs. Our experiments on three languages demonstrate 3-20x speed-up in training time at competitive semantic and syntactic accuracy.", + "title": "Faster Training of Word Embeddings", + "authors": [ + "Eliza Wszola", + "Martin Jaggi", + "Markus P\u00fcschel" + ], + "emails": [ + "~Eliza_Wszola1", + "~Markus_P\u00fcschel1", + "~Martin_Jaggi1" + ], + "rank": 1675 + }, + { + "url": "https://openreview.net/forum?id=trj4iYJpIvy", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4, + 7 + ], + "rating": "5.15", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Principal component analysis (PCA) is a widely used dimension reduction technique in machine learning and multivariate statistics. 
To improve the interpretability of PCA, various approaches to obtain sparse principal direction loadings have been proposed, which are termed Sparse Principal Component Analysis (SPCA). In this paper, we present three provably accurate, polynomial time, approximation algorithms for the SPCA problem, without imposing any restrictive assumptions on the input covariance matrix. The first algorithm is based on randomized matrix multiplication; the second algorithm is based on a novel deterministic thresholding scheme; and the third algorithm is based on a semidefinite programming relaxation of SPCA. All algorithms come with provable guarantees and run in low-degree polynomial time. Our empirical evaluations confirm our theoretical findings.", + "title": "Approximation Algorithms for Sparse Principal Component Analysis", + "authors": [ + "Agniva Chowdhury", + "Petros Drineas", + "David Woodruff", + "Samson Zhou" + ], + "emails": [ + "~Agniva_Chowdhury1", + "~David_Woodruff2", + "~Petros_Drineas1", + "~Samson_Zhou1" + ], + "rank": 1676 + }, + { + "url": "https://openreview.net/forum?id=oGzm2X0aek", + "decision": "Reject", + "ratings": [ + 7, + 7, + 4, + 4 + ], + "rating": "5.15", + "confidences": [ + 3, + 2, + 4, + 4 + ], + "abstract": "The training process of RL requires many trial-and-errors that are costly in real-world applications. To avoid the cost, a promising solution is to learn the policy from an offline dataset, e.g., to learn a simulator from the dataset, and train optimal policies in the simulator. By this approach, the quality of policies highly relies on the fidelity of the simulator. Unfortunately, due to the stochasticity and unsteadiness of the real-world and the unavailability of online sampling, the distortion of the simulator is inevitable. In this paper, based on the model learning technique, we propose a new paradigm to learn an RL policy from offline data in the real-world sequential recommendation system (SRS). 
Instead of increasing the fidelity of models for policy learning, we handle the distortion issue via learning to adapt to diverse simulators generated by the offline dataset. The adaptive policy is suitable to real-world environments where dynamics are changing and have stochasticity in the offline setting. Experiments are conducted in synthetic environments and a real-world ride-hailing platform. The results show that the method overcomes the distortion problem and produces robust recommendations in the unseen real-world.", + "title": "Offline Adaptive Policy Leaning in Real-World Sequential Recommendation Systems", + "authors": [ + "Xiong-Hui Chen", + "Yang Yu", + "Qingyang Li", + "Zhiwei Tony Qin", + "Wenjie Shang", + "Yiping Meng", + "Jieping Ye" + ], + "emails": [ + "didiglobal.com", + "~Jieping_Ye3", + "~Yang_Yu5", + "~Yiping_Meng1", + "~Xiong-Hui_Chen1", + "~Qingyang_Li4", + "~Zhiwei_Tony_Qin1" + ], + "rank": 1677 + }, + { + "url": "https://openreview.net/forum?id=E3UZoJKHxuk", + "decision": "Reject", + "ratings": [ + 6, + 4, + 6, + 5 + ], + "rating": "5.14", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Current supervised learning can learn spurious correlation during the data-fitting process, imposing issues regarding interpretability, out-of-distribution (OOD) generalization, and robustness. To avoid spurious correlation, we propose a \\textbf{La}tent \\textbf{C}ausal \\textbf{I}nvariance \\textbf{M}odel (LaCIM) which pursues \\emph{causal prediction}. Specifically, we introduce latent variables that are separated into (a) output-causative factors and (b) others that are spuriously correlated to the output via confounders, to model the underlying causal factors. We further assume the generating mechanisms from latent space to observed data to be \\emph{causally invariant}. 
We give the identifiable claim of such invariance, particularly the disentanglement of output-causative factors from others, as a theoretical guarantee for precise inference and avoiding spurious correlation. We propose a Variational-Bayesian-based method for estimation and to optimize over the latent space for prediction. The utility of our approach is verified by improved interpretability, prediction power on various OOD scenarios (including healthcare) and robustness on security. ", + "title": "Latent Causal Invariant Model", + "authors": [ + "Xinwei Sun", + "Botong Wu", + "Chang Liu", + "Xiangyu Zheng", + "Wei Chen", + "Tao Qin", + "Tie-Yan Liu" + ], + "emails": [ + "~Xiangyu_Zheng1", + "~Tao_Qin1", + "~Tie-Yan_Liu1", + "~Botong_Wu1", + "~Chang_Liu10", + "~Wei_Chen1", + "~Xinwei_Sun1" + ], + "rank": 1678 + }, + { + "url": "https://openreview.net/forum?id=rx19UMFbC9u", + "decision": "Reject", + "ratings": [ + 4, + 7, + 5, + 5 + ], + "rating": "5.14", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Network pruning has been widely adopted for reducing computational cost and memory consumption in low-resource devices. Recent studies show that saliency-based pruning can achieve high compression ratios (e.g., 80-90% of the parameters in original networks are removed) without sacrificing much accuracy loss. Nevertheless, finding the well-trainable networks with sparse parameters (e.g., < 10% of the parameters remaining) is still challenging to network pruning, commonly believed to lack model capacity. In this work, we revisit the procedure of existing pruning methods and observe that dead connections, which do not contribute to model capacity, appear regardless of pruning methods. To this end, we propose a novel pruning method, called all-alive pruning (AAP), producing the pruned networks with only trainable weights. Notably, AAP is broadly applicable to various saliency-based pruning methods and model architectures. 
We demonstrate that AAP equipped with existing pruning methods (i.e., iterative pruning, one-shot pruning, and dynamic pruning) consistently improves the accuracy of original methods at 128\u00d7 - 4096\u00d7 compression ratios on three benchmark datasets.", + "title": "Waste not, Want not: All-Alive Pruning for Extremely Sparse Networks", + "authors": [ + "Daejin Kim", + "Hyunjung Shim", + "Jongwuk Lee" + ], + "emails": [ + "~Jongwuk_Lee1", + "~Daejin_Kim1", + "~Hyunjung_Shim1" + ], + "rank": 1679 + }, + { + "url": "https://openreview.net/forum?id=FKotzp6PZJw", + "decision": "Reject", + "ratings": [ + 6, + 3, + 6, + 6 + ], + "rating": "5.14", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Double Q-learning is a classical method for reducing overestimation bias, which is caused by taking maximum estimated values in the Bellman operator. Its variants in the deep Q-learning paradigm have shown great promise in producing reliable value prediction and improving learning performance. However, as shown by prior work, double Q-learning is not fully unbiased and still suffers from underestimation bias. In this paper, we show that such underestimation bias may lead to multiple non-optimal fixed points under an approximated Bellman operation. To address the concerns of converging to non-optimal stationary solutions, we propose a simple and effective approach as a partial fix for underestimation bias in double Q-learning. This approach leverages real returns to bound the target value. 
We extensively evaluate the proposed method in the Atari benchmark tasks and demonstrate its significant improvement over baseline algorithms.", + "title": "On the Estimation Bias in Double Q-Learning", + "authors": [ + "Zhizhou Ren", + "Guangxiang Zhu", + "Beining Han", + "Jianglun Chen", + "Chongjie Zhang" + ], + "emails": [ + "~Guangxiang_Zhu1", + "~Jianglun_Chen2", + "~Beining_Han1", + "~Chongjie_Zhang1", + "~Zhizhou_Ren1" + ], + "rank": 1680 + }, + { + "url": "https://openreview.net/forum?id=ol_xwLR2uWD", + "decision": "Reject", + "ratings": [ + 5, + 9, + 3, + 4 + ], + "rating": "5.14", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "The pressing need for pretraining algorithms has been diminished by numerous advances in terms of regularization, architectures, and optimizers. Despite this trend, we re-visit the classic idea of unsupervised autoencoder pretraining and propose a modified variant that relies on a full reverse pass trained in conjunction with a given training task. We establish links between SVD and pretraining and show how it can be leveraged for gaining insights about the learned structures. Most importantly, we demonstrate that our approach yields an improved performance for a wide variety of relevant learning and transfer tasks ranging from fully connected networks over ResNets to GANs. Our results demonstrate that unsupervised pretraining has not lost its practical relevance in today's deep learning environment.", + "title": "Reviving Autoencoder Pretraining", + "authors": [ + "You Xie", + "Nils Thuerey" + ], + "emails": [ + "~You_Xie1", + "~Nils_Thuerey1" + ], + "rank": 1681 + }, + { + "url": "https://openreview.net/forum?id=gSJTgko59MC", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 4 + ], + "rating": "5.14", + "confidences": [ + 5, + 3, + 4, + 2 + ], + "abstract": "A recent line of works studied wide deep neural networks (DNNs) by approximating them as Gaussian Processes (GPs). 
A DNN trained with gradient flow was shown to map to a GP governed by the Neural Tangent Kernel (NTK), whereas earlier works showed that a DNN with an i.i.d. prior over its parameters maps to the so-called Neural Network Gaussian Process (NNGP). Here we consider a DNN training protocol, involving noise, weight decay and finite width, whose outcome corresponds to a certain non-Gaussian stochastic process. An analytical framework is then introduced to analyze this non-Gaussian process, whose deviation from a GP is controlled by the finite width. Our contribution is three-fold:\n(i) In the infinite width limit, we establish a correspondence between DNNs trained with noisy gradients and the NNGP, not the NTK. \n(ii) We provide a general analytical form for the finite width correction (FWC) for DNNs with arbitrary activation functions and depth and use it to predict the outputs of empirical finite networks with high accuracy. \nAnalyzing the FWC behavior as a function of n, the training set size, we find that it is negligible for both the very small n regime, and, surprisingly, for the large n regime (where the GP error scales as O(1/n)).\n(iii) We flesh-out algebraically how these FWCs can improve the performance of finite convolutional neural networks (CNNs) relative to their GP counterparts on image classification tasks.", + "title": "Predicting the Outputs of Finite Networks Trained with Noisy Gradients", + "authors": [ + "Gadi Naveh", + "Oded Ben-David", + "Haim Sompolinsky", + "Zohar Ringel" + ], + "emails": [ + "~Zohar_Ringel1", + "~Haim_Sompolinsky1", + "~Gadi_Naveh1", + "mail.huji.ac.il" + ], + "rank": 1682 + }, + { + "url": "https://openreview.net/forum?id=tq5JAGsedIP", + "decision": "Reject", + "ratings": [ + 7, + 4, + 5, + 5 + ], + "rating": "5.14", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Representation learning models for graphs are a successful family of techniques that project nodes into feature spaces that can be exploited by other 
machine learning algorithms. Since many real-world networks are inherently dynamic, with interactions among nodes changing over time, these techniques can be defined both for static and for time-varying graphs. Here, we show how the skip-gram embedding approach can be used to perform implicit tensor factorization on different tensor representations of time-varying graphs. We show that higher-order skip-gram with negative sampling (HOSGNS) is able to disentangle the role of nodes and time, with a small fraction of the number of parameters needed by other approaches. We empirically evaluate our approach using time-resolved face-to-face proximity data, showing that the learned representations outperform state-of-the-art methods when used to solve downstream tasks such as network reconstruction. Good performance on predicting the outcome of dynamical processes such as disease spreading shows the potential of this new method to estimate contagion risk, providing early risk awareness based on contact tracing data.", + "title": "Time-varying Graph Representation Learning via Higher-Order Skip-Gram with Negative Sampling", + "authors": [ + "Simone Piaggesi", + "Andr\u00e9 Panisson" + ], + "emails": [ + "gmail.com", + "~Simone_Piaggesi1" + ], + "rank": 1683 + }, + { + "url": "https://openreview.net/forum?id=BIwkgTsSp_8", + "decision": "Reject", + "ratings": [ + 6, + 3, + 6, + 6 + ], + "rating": "5.14", + "confidences": [ + 5, + 4, + 2, + 3 + ], + "abstract": "In recent years, the collection and sharing of individuals\u2019 private data has become commonplace in many industries. Local differential privacy (LDP) is a rigorous approach which uses a randomized algorithm to preserve privacy even from the database administrator, unlike the more standard central differential privacy. For LDP, when applying noise directly to high-dimensional data, the level of noise required all but entirely destroys data utility. 
In this paper we introduce a novel, application-agnostic privatization mechanism that leverages representation learning to overcome the prohibitive noise requirements of direct methods, while maintaining the strict guarantees of LDP. We further demonstrate that data privatized with this mechanism can be used to train machine learning algorithms. Applications of this model include private data collection, private novel-class classification, and the augmentation of clean datasets with additional privatized features. We achieve significant gains in performance on downstream classification tasks relative to benchmarks that noise the data directly, which are state-of-the-art in the context of application-agnostic LDP mechanisms for high-dimensional data sharing tasks.", + "title": "Learning to Noise: Application-Agnostic Data Sharing with Local Differential Privacy", + "authors": [ + "Alex Mansbridge", + "Gregory Barbour", + "Davide Piras", + "Christopher Frye", + "Ilya Feige", + "David Barber" + ], + "emails": [ + "~Alex_Mansbridge1", + "~Christopher_Frye1", + "~Davide_Piras1", + "~Ilya_Feige1", + "~Gregory_Barbour1", + "~David_Barber2" + ], + "rank": 1684 + }, + { + "url": "https://openreview.net/forum?id=7IElVSrNm54", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 4 + ], + "rating": "5.13", + "confidences": [ + 3, + 5, + 4, + 3 + ], + "abstract": "In a statistical notion of algorithmic fairness, we partition individuals into groups based on some key demographic factors such as race and gender, and require that some statistics of a classifier be approximately equalized across those groups. Current approaches require complete annotations for demographic factors, or focus on an abstract worst-off group rather than demographic groups. In this paper, we consider the setting where the demographic factors are only partially available. 
For example, we have training examples for white-skinned and dark-skinned males, and white-skinned females, but we have zero examples for dark-skinned females. We could also have zero examples for females regardless of their skin colors. Without additional knowledge, it is impossible to directly control the discrepancy of the classifier's statistics for those invisible groups. We develop a disentanglement algorithm that splits a representation of data into a component that captures the demographic factors and another component that is invariant to them based on a context dataset. The context dataset is much like the deployment dataset, it is unlabeled but it contains individuals from all demographics including the invisible. We cluster the context set, equalize the cluster size to form a \"perfect batch\", and use it as a supervision signal for the disentanglement. We propose a new discriminator loss based on a learnable attention mechanism to distinguish a perfect batch from a non-perfect one. We evaluate our approach on standard classification benchmarks and show that it is indeed possible to protect invisible demographics.", + "title": "Zero-shot Fairness with Invisible Demographics", + "authors": [ + "Thomas Kehrenberg", + "Viktoriia Sharmanska", + "Myles Scott Bartlett", + "Novi Quadrianto" + ], + "emails": [ + "~Viktoriia_Sharmanska1", + "~Myles_Scott_Bartlett1", + "~Thomas_Kehrenberg1", + "~Novi_Quadrianto1" + ], + "rank": 1685 + }, + { + "url": "https://openreview.net/forum?id=zq4bt_0z-gz", + "decision": "Reject", + "ratings": [ + 7, + 7, + 4, + 3 + ], + "rating": "5.13", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "In many sequence learning tasks, such as program synthesis and document summarization, a key problem is searching over a large space of possible output sequences. 
We propose to learn representations of the outputs that is specifically meant for search: rich enough to specify the desired output but compact enough to make search more efficient. An appealing realization of such representation are discrete latent codes, as this naturally allows sophisticated combinatorial search strategies. The latent codes are learned using a self-supervised learning principle, in which first a discrete autoencoder is trained on the output sequences, and then the resulting latent codes are used as intermediate targets for the end-to-end sequence prediction task. Based on these insights, we introduce the Latent Programmer, a program synthesis method that first predicts a discrete latent codes from input/output examples, and then generates the program in the target language. We evaluate the Latent Programmer on two domains: synthesis of string transformation programs, and generation of programs from natural language descriptions. We demonstrate that the discrete latent representation significantly improves synthesis accuracy.", + "title": "Latent Programmer: Discrete Latent Codes for Program Synthesis", + "authors": [ + "Joey Hong", + "David Dohan", + "Rishabh Singh", + "Charles Sutton", + "Manzil Zaheer" + ], + "emails": [ + "~David_Dohan1", + "~Charles_Sutton1", + "~Rishabh_Singh1", + "~Manzil_Zaheer1", + "~Joey_Hong2" + ], + "rank": 1686 + }, + { + "url": "https://openreview.net/forum?id=FTit3PiAw4", + "decision": "Reject", + "ratings": [ + 3, + 6, + 5, + 6 + ], + "rating": "5.13", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Recently, Generative Adversarial Networks (GANs) have demonstrated their potential in federated learning, i.e., learning a centralized model from data privately hosted by multiple sites. A federated GAN jointly trains a centralized generator and multiple private discriminators hosted at different sites. A major theoretical challenge for the federated GAN is the heterogeneity of the local data distributions. 
Traditional approaches cannot guarantee to learn the target distribution, which is a mixture of the highly different local distributions. This paper tackles this theoretical challenge, and for the first time, provides a provably correct framework for federated GAN. We propose a new approach called Universal Aggregation, which simulates a centralized discriminator via carefully aggregating the mixture of all private discriminators. We prove that a generator trained with this simulated centralized discriminator can learn the desired target distribution. Through synthetic and real datasets, we show that our method can learn the mixture of largely different distributions, when existing federated GAN methods fail to.", + "title": "Training Federated GANs with Theoretical Guarantees: A Universal Aggregation Approach", + "authors": [ + "Yikai Zhang", + "Hui Qu", + "Huidong Liu", + "Qi Chang", + "Dimitris N. Metaxas", + "Chao Chen" + ], + "emails": [ + "~Qi_Chang1", + "~Chao_Chen1", + "~Dimitris_N._Metaxas1", + "~Yikai_Zhang1", + "~Huidong_Liu1", + "~Hui_Qu1" + ], + "rank": 1687 + }, + { + "url": "https://openreview.net/forum?id=kOA6rtPxyL", + "decision": "Reject", + "ratings": [ + 4, + 5, + 7, + 5 + ], + "rating": "5.13", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Gradient-based meta-learning relates task-specific models to a meta-model by gradients. By this design, an algorithm first optimizes the task-specific models by an inner loop and then backpropagates meta-gradients through the loop to update the meta-model. The number of inner-loop optimization steps has to be small (e.g., one step) to avoid high-order derivatives, big memory footprints, and the risk of vanishing or exploding meta-gradients. We propose an intuitive teacher-student scheme to enable the gradient-based meta-learning algorithms to explore long horizons by the inner loop. 
The key idea is to employ a student network to explore the search space of task-specific models adequately (e.g., by more than ten steps), and a teacher then takes a ``leap'' toward the regions probed by the student. The teacher not only arrives at a high-quality model but also defines a lightweight computation graph for meta-gradients. Our approach is generic, as we verify its effectiveness with four meta-learning algorithms over three tasks: few-shot learning, long-tailed classification, and meta-attack.", + "title": "A Lazy Approach to Long-Horizon Gradient-Based Meta-Learning", + "authors": [ + "Muhammad Abdullah Jamal", + "Liqiang Wang", + "Boqing Gong" + ], + "emails": [ + "~Liqiang_Wang1", + "~Muhammad_Abdullah_Jamal2", + "~Boqing_Gong1" + ], + "rank": 1688 + }, + { + "url": "https://openreview.net/forum?id=F9sPTWSKznC", + "decision": "Reject", + "ratings": [ + 6, + 7, + 4, + 4 + ], + "rating": "5.13", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Despite increasing instances of machine translation (MT) systems including extrasentential context information, the evidence for translation quality improvement is sparse, especially for discourse phenomena. Popular metrics like BLEU are not expressive or sensitive enough to capture quality improvements or drops that are minor in size but significant in perception. We introduce the first of their kind MT benchmark testsets that aim to track and hail improvements across four main discourse phenomena: anaphora, lexical consistency, coherence and readability, and discourse connective translation. We also introduce evaluation methods for these tasks, and evaluate several competitive baseline MT systems on the curated datasets. Surprisingly, we find that the complex context-aware models that we test do not improve discourse-related translations consistently across languages and phenomena. Our evaluation benchmark is available as a leaderboard at <dipbenchmark1.github.io>. 
", + "title": "DiP Benchmark Tests: Evaluation Benchmarks for Discourse Phenomena in MT", + "authors": [ + "Prathyusha Jwalapuram", + "Barbara Rychalska", + "Shafiq Joty", + "Dominika Basaj" + ], + "emails": [ + "~Dominika_Basaj1", + "~Prathyusha_Jwalapuram1", + "~Shafiq_Joty1", + "~Barbara_Rychalska1" + ], + "rank": 1689 + }, + { + "url": "https://openreview.net/forum?id=KCzRX9N8BIH", + "decision": "Reject", + "ratings": [ + 4, + 5, + 6, + 6 + ], + "rating": "5.13", + "confidences": [ + 4, + 5, + 2, + 4 + ], + "abstract": "Many common loss functions such as mean-squared-error, cross-entropy, and reconstruction loss are unnecessarily rigid. Under a probabilistic interpretation, these common losses correspond to distributions with fixed shapes and scales. We instead argue for optimizing full likelihoods that include parameters like the normal variance and softmax temperature. Joint optimization of these ``likelihood parameters'' with model parameters can adaptively tune the scales and shapes of losses in addition to the strength of regularization. We explore and systematically evaluate how to parameterize and apply likelihood parameters for robust modeling, outlier-detection, and re-calibration. Additionally, we propose adaptively tuning L2 and L1 weights by fitting the scale parameters of normal and Laplace priors and introduce more flexible element-wise regularizers.", + "title": "It Is Likely That Your Loss Should be a Likelihood", + "authors": [ + "Mark Hamilton", + "Evan Shelhamer", + "William T. 
Freeman" + ], + "emails": [ + "~William_T._Freeman1", + "~Evan_Shelhamer2", + "~Mark_Hamilton1" + ], + "rank": 1690 + }, + { + "url": "https://openreview.net/forum?id=3eNrIs9I78x", + "decision": "Reject", + "ratings": [ + 5, + 4, + 6, + 6 + ], + "rating": "5.13", + "confidences": [ + 5, + 4, + 3, + 3 + ], + "abstract": "In an effort to improve generalization in deep learning, we propose SALR: a sharpness-aware learning rate update technique designed to recover flat minimizers. Our method dynamically updates the learning rate of gradient-based optimizers based on the local sharpness of the loss function. This allows optimizers to automatically increase learning rates at sharp valleys to increase the chance of escaping them. We demonstrate the effectiveness of SALR when adopted by various algorithms over a broad range of networks. Our experiments indicate that SALR improves generalization, converges faster, and drives solutions to significantly flatter regions. ", + "title": "SALR: Sharpness-aware Learning Rates for Improved Generalization", + "authors": [ + "Xubo Yue", + "Maher Nouiehed", + "Raed Al Kontar" + ], + "emails": [ + "~Raed_Al_Kontar1", + "~Maher_Nouiehed1", + "~Xubo_Yue1" + ], + "rank": 1691 + }, + { + "url": "https://openreview.net/forum?id=NPab8GcO5Pw", + "decision": "Reject", + "ratings": [ + 5, + 4, + 7, + 4 + ], + "rating": "5.12", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Network pruning, or sparse network has a long history and practical significance in modern applications. Although the loss functions of neural networks may yield bad landscape due to non-convexity, we focus on linear activation which already owes benign landscape. 
With no unrealistic assumption, we conclude the following statements for the squared loss objective of general sparse linear neural networks: 1) every local minimum is a global minimum for scalar output with any sparse structure, or non-intersected sparse first layer and dense other layers with orthogonal training data; 2) sparse linear networks have sub-optimal local-min for only sparse first layer due to low rank constraint, or output larger than three dimensions due to the global minimum of a sub-network. Overall, sparsity breaks the normal structure, cutting out the decreasing path in original fully-connected networks.", + "title": "On the Landscape of Sparse Linear Networks", + "authors": [ + "Dachao Lin", + "Ruoyu Sun", + "Zhihua Zhang" + ], + "emails": [ + "~Ruoyu_Sun1", + "~Zhihua_Zhang1", + "~Dachao_Lin1" + ], + "rank": 1692 + }, + { + "url": "https://openreview.net/forum?id=bJxgv5C3sYc", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 6, + 4, + 5 + ], + "rating": "5.12", + "confidences": [ + 4, + 3, + 5, + 4 + ], + "abstract": "Hyperparameter optimization (HPO) is a central pillar in the automation of machine learning solutions and is mainly performed via Bayesian optimization, where a parametric surrogate is learned to approximate the black box response function (e.g. validation error). Unfortunately, evaluating the response function is computationally intensive. As a remedy, earlier work emphasizes the need for transfer learning surrogates which learn to optimize hyperparameters for an algorithm from other tasks. In contrast to previous work, we propose to rethink HPO as a few-shot learning problem in which we train a shared deep surrogate model to quickly adapt (with few response evaluations) to the response function of a new task. We propose the use of a deep kernel network for a Gaussian process surrogate that is meta-learned in an end-to-end fashion in order to jointly approximate the response functions of a collection of training data sets. 
As a result, the novel few-shot optimization of our deep kernel surrogate leads to new state-of-the-art results at HPO compared to several recent methods on diverse metadata sets.", + "title": "Few-Shot Bayesian Optimization with Deep Kernel Surrogates", + "authors": [ + "Martin Wistuba", + "Josif Grabocka" + ], + "emails": [ + "~Josif_Grabocka1", + "~Martin_Wistuba1" + ], + "rank": 1693 + }, + { + "url": "https://openreview.net/forum?id=io-EI8C0q6A", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4, + 6 + ], + "rating": "5.12", + "confidences": [ + 5, + 3, + 5, + 4 + ], + "abstract": "This paper presents XLSR which learns cross-lingual speech representations by pretraining a single model from the raw waveform of speech in multiple languages. We build on wav2vec 2.0 which is trained by solving a contrastive task over masked latent speech representations and jointly learns a quantization of the latents shared across languages. The resulting model is fine-tuned on labeled data and experiments show that cross-lingual pretraining significantly outperforms monolingual pretraining. On the CommonVoice benchmark, XLSR shows a relative phoneme error rate reduction of 72% compared to the best known results. On BABEL, our approach improves word error rate by 16% relative compared to a comparable system. Our approach enables a single multilingual speech recognition model which is competitive to strong individual models. 
Analysis shows that the latent discrete speech representations are shared across languages with increased sharing for related languages.", + "title": "Unsupervised Cross-lingual Representation Learning for Speech Recognition", + "authors": [ + "Alexis Conneau", + "Alexei Baevski", + "Ronan Collobert", + "Abdelrahman Mohamed", + "Michael Auli" + ], + "emails": [ + "~Michael_Auli1", + "~Alexis_Conneau1", + "~Ronan_Collobert1", + "~Alexei_Baevski1", + "~Abdelrahman_Mohamed2" + ], + "rank": 1694 + }, + { + "url": "https://openreview.net/forum?id=w_haMPbUgWb", + "decision": "Reject", + "ratings": [ + 7, + 6, + 4, + 4 + ], + "rating": "5.12", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "Encoder-decoder architecture has been widely used in neural machine translation (NMT). A few methods have been proposed to improve it with multiple passes of decoding. However, their full potential is limited by a lack of appropriate termination policy. To address this issue, we present a novel framework, Rewriter-Evaluator. It consists of a rewriter and an evaluator. Translating a source sentence involves multiple passes. At every pass, the rewriter produces a new translation to improve the past translation and the evaluator estimates the translation quality to decide whether to terminate the rewriting process. We also propose a prioritized gradient descent (PGD) method that facilitates training the rewriter and the evaluator jointly. Though incurring multiple passes of decoding, Rewriter-Evaluator with the proposed PGD method can be trained with similar time to that of training encoder-decoder models. We apply the proposed framework to improve the general NMT models (e.g., Transformer). 
We conduct extensive experiments on two translation tasks, Chinese-English and English-German, and show that the proposed framework notably improves the performances of NMT models and significantly outperforms previous baselines.", + "title": "Rewriter-Evaluator Framework for Neural Machine Translation", + "authors": [ + "Yangming Li", + "Kaisheng Yao" + ], + "emails": [ + "~Kaisheng_Yao2", + "~Yangming_Li1" + ], + "rank": 1695 + }, + { + "url": "https://openreview.net/forum?id=VETJKjmyJyK", + "ratings": [ + 5, + 5, + 5, + 6, + 6, + 4 + ], + "rating": "5.12", + "confidences": [ + 4, + 5, + 4, + 3, + 4, + 4 + ], + "abstract": "This paper presents a novel method for embedding transfer, a task of transferring knowledge of a learned embedding model to another. Our method exploits pairwise similarities between samples in the source embedding space as the knowledge, and transfers it through a loss function used for learning target embedding models. To this end, we design a new loss called smooth contrastive loss, which pulls together or pushes apart a pair of samples in a target embedding space with strength determined by their semantic similarity in the source embedding space; an analysis of the loss reveals that this property enables more important pairs to contribute more to learning the target embedding space. Experiments on metric learning benchmarks demonstrate that our method improves performance, or reduces sizes and embedding dimensions of target models effectively. Moreover, we show that deep networks trained in a self-supervised manner can be further enhanced by our method with no additional supervision. 
In all the experiments, our method clearly outperforms existing embedding transfer techniques.", + "title": "Embedding Transfer via Smooth Contrastive Loss", + "authors": [ + "Sungyeon Kim", + "Dongwon Kim", + "Minsu Cho", + "Suha Kwak" + ], + "emails": [ + "~Dongwon_Kim1", + "~Suha_Kwak3", + "~Minsu_Cho1", + "~Sungyeon_Kim1" + ], + "rank": 1696 + }, + { + "url": "https://openreview.net/forum?id=oev4KdikGjy", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4, + 6 + ], + "rating": "5.12", + "confidences": [ + 4, + 3, + 5, + 4 + ], + "abstract": "Mixed Sample Data Augmentation (MSDA) has received increasing attention in recent years, with many successful variants such as MixUp and CutMix. We analyse MSDA from an information theoretic perspective, characterising learned models in terms of how they impact the models\u2019 perception of the data. Ultimately, our analyses allow us to decouple two complementary properties of augmentations that are useful for reasoning about MSDA. From insight on the efficacy of CutMix in particular, we subsequently propose FMix, an MSDA that uses binary masks obtained by applying a threshold to low frequency images sampled from Fourier space. 
FMix improves performance over MixUp and CutMix for a number of models across a range of data sets and problem settings, obtaining new state-of-the-art results on CIFAR-10 and Fashion-MNIST.", + "title": "FMix: Enhancing Mixed Sample Data Augmentation", + "authors": [ + "Ethan Harris", + "Antonia Marcu", + "Matthew Painter", + "Mahesan Niranjan", + "Adam Prugel-Bennett", + "Jonathon Hare" + ], + "emails": [ + "~Ethan_Harris1", + "~Jonathon_Hare1", + "~Adam_Prugel-Bennett1", + "~Mahesan_Niranjan1", + "ecs.soton.ac.uk" + ], + "rank": 1697 + }, + { + "url": "https://openreview.net/forum?id=ni_nys-C9D6", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 4, + 6 + ], + "rating": "5.12", + "confidences": [ + 3, + 4, + 3, + 4, + 2 + ], + "abstract": "Reverse-mode automatic differentiation (AD) suffers from the issue of having too much space overhead to trace back intermediate computational states for backpropagation.\nThe traditional method to trace back states is called checkpointing that stores intermediate states into a global stack and restore state through either stack pop or re-computing.\nThe overhead of stack manipulations and re-computing makes the general purposed (or not tensor-based) AD engines unable to meet many industrial needs.\nInstead of checkpointing, we propose to use reverse computing to trace back states by designing and implementing a reversible programming eDSL, where a program can be executed bi-directionally without implicit stack operations. 
The absence of implicit stack operations makes the program compatible with existing compiler features, including utilizing existing optimization passes and compiling the code as GPU kernels.\nWe implement AD for sparse matrix operations and some machine learning applications to show that our framework has state-of-the-art performance.", + "title": "Differentiate Everything with a Reversible Domain-Specific Language", + "authors": [ + "JinGuo Liu", + "Taine Zhao" + ], + "emails": [ + "~JinGuo_Liu1", + "logic.cs.tsukuba.ac.jp" + ], + "rank": 1698 + }, + { + "url": "https://openreview.net/forum?id=SVsLxTfHa1", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5, + 5 + ], + "rating": "5.12", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "Cross-lingual word embeddings (CLWE) have been proven useful in many cross-lingual tasks. However, most existing approaches to learn CLWE including the ones with contextual embeddings are sense agnostic. In this work, we propose a novel framework to align contextual embeddings at the sense level by leveraging cross-lingual signal from bilingual dictionaries only. We operationalize our framework by first proposing a novel sense-aware cross entropy loss to model word senses explicitly. The monolingual ELMo and BERT models pretrained with our sense-aware cross entropy loss demonstrate significant performance improvement for word sense disambiguation tasks. We then propose a sense alignment objective on top of the sense-aware cross entropy loss for cross-lingual model pretraining, and pretrain cross-lingual models for several language pairs (English to German/Spanish/Japanese/Chinese). Compared with the best baseline results, our cross-lingual models achieve 0.52%, 2.09% and 1.29% average performance improvements on zero-shot cross-lingual NER, sentiment classification and XNLI tasks, respectively. 
We will release our code.", + "title": "Towards Multi-Sense Cross-Lingual Alignment of Contextual Embeddings", + "authors": [ + "Linlin Liu", + "Thien Hai Nguyen", + "Shafiq Joty", + "Lidong Bing", + "Luo Si" + ], + "emails": [ + "~Luo_Si2", + "~Shafiq_Joty1", + "~Thien_Hai_Nguyen1", + "~Linlin_Liu2", + "~Lidong_Bing2" + ], + "rank": 1699 + }, + { + "url": "https://openreview.net/forum?id=2Id6XxTjz7c", + "decision": "Reject", + "ratings": [ + 4, + 6, + 6, + 5 + ], + "rating": "5.12", + "confidences": [ + 5, + 4, + 3, + 4 + ], + "abstract": "As a practical model compression technique, parameter quantization is effective especially for language models associated with a large memory footprint. Neural network quantization is usually performed to reduce quantization loss assuming that quantization error of each parameter equally contributes to the overall training loss. The importance of each parameter, however, may highly differ such that for the same number of quantization bits, certain parameters lead to higher training loss than the others after quantization. In this paper, we consider a non-uniform quantization scheme, specifically binary-coding-based quantization, for high compression ratio and efficient computations while avoiding large accuracy degradation by uniform quantization (e.g., INT8). Then, we derive quantization optimization methods to take into account the importance of each parameter. We demonstrate that for post-training quantization, weight magnitude can represent importance and improve model accuracy significantly compared to the previous schemes lacking importance considerations. 
For various language models including BERT, DistilBERT, AWD-LSTM, and Transformer, we achieve 2-4 bits per weight by our proposed post-training quantization with reasonable accuracy degradation.", + "title": "Post-Training Weighted Quantization of Neural Networks for Language Models", + "authors": [ + "Se Jung Kwon", + "Dongsoo Lee", + "Yongkweon Jeon", + "Byeongwook Kim", + "Bae Seong Park", + "Yeonju Ro" + ], + "emails": [ + "~Yongkweon_Jeon1", + "~Byeongwook_Kim1", + "~Dongsoo_Lee1", + "~Se_Jung_Kwon1", + "~Yeonju_Ro1", + "~Bae_Seong_Park1" + ], + "rank": 1700 + }, + { + "url": "https://openreview.net/forum?id=q-qxdClTs0d", + "decision": "Reject", + "ratings": [ + 4, + 7, + 6, + 4 + ], + "rating": "5.12", + "confidences": [ + 4, + 4, + 3, + 5 + ], + "abstract": "This work considers the out-of-distribution (OOD) prediction problem where (1)~the training data are from multiple domains and (2)~the test domain is unseen in the training. DNNs fail in OOD prediction because they are prone to pick up spurious correlations. Recently, Invariant Risk Minimization (IRM) is proposed to address this issue. Its effectiveness has been demonstrated in the colored MNIST experiment. Nevertheless, we find that the performance of IRM can be dramatically degraded under \\emph{strong \u039b spuriousness} -- when the spurious correlation between the spurious features and the class label is strong due to the strong causal influence of their common cause, the domain label, on both of them (see Fig. 1). In this work, we try to answer the questions: why does IRM fail in the aforementioned setting? Why does IRM work for the original colored MNIST dataset? Then, we propose a simple and effective approach to fix the problem of IRM. We combine IRM with conditional distribution matching to avoid a specific type of spurious correlation under strong \u039b spuriousness. 
Empirically, we design a series of semi synthetic datasets -- the colored MNIST plus, which exposes the problems of IRM and demonstrates the efficacy of the proposed method.", + "title": "Out-of-distribution Prediction with Invariant Risk Minimization: The Limitation and An Effective Fix", + "authors": [ + "Ruocheng Guo", + "Pengchuan Zhang", + "Hao Liu", + "Emre Kiciman" + ], + "emails": [ + "~Hao_Liu2", + "~Ruocheng_Guo1", + "~Emre_Kiciman1", + "~Pengchuan_Zhang1" + ], + "rank": 1701 + }, + { + "url": "https://openreview.net/forum?id=xYJpCgSZff", + "decision": "Reject", + "ratings": [ + 5, + 7, + 6, + 3 + ], + "rating": "5.12", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "Information Extraction (IE) aims to extract structured information from unstructured texts. However, in practice, the long-tailed and imbalanced data may lead to severe bias issues for deep learning models, due to very few training instances available for the tail classes. Existing works are mainly from computer vision society, leveraging re-balancing, decoupling, transfer learning and causal inference to address this problem on image classification and scene graph generation. However, these approaches may not achieve good performance on textual data, which involves complex language structures that have been proven crucial for the IE tasks. To this end, we propose a novel framework (named CFIE) based on language structure and causal reasoning with three key ingredients. First, by fusing the syntax information to various structured causal models for mainstream IE tasks including relation extraction (RE), named entity recognition (NER), and event detection (ED), our approach is able to learn the direct effect for classification from an imbalanced dataset. Second, counterfactuals are generated based on an explicit language structure to better calculate the direct effect during the inference stage. 
Third, we propose a flexible debiasing approach for more robust prediction during the inference stage. Experimental results on three IE tasks across five public datasets show that our model significantly outperforms the state-of-the-art models by a large margin in terms of Mean Recall and Macro F1, achieving a relative 30% improvement in Mean Recall for 7 tail classes on the ACE2005 dataset. We also discuss some interesting findings based on our observations.", + "title": "Counterfactual Thinking for Long-tailed Information Extraction", + "authors": [ + "Guoshun Nan", + "Jiaqi Zeng", + "Rui Qiao", + "Wei Lu" + ], + "emails": [ + "sutd.edu.sg", + "~Jiaqi_Zeng1", + "~Guoshun_Nan1" + ], + "rank": 1702 + }, + { + "url": "https://openreview.net/forum?id=fpJX0O5bWKJ", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 4, + 3 + ], + "rating": "5.11", + "confidences": [ + 4, + 4, + 4, + 4, + 3 + ], + "abstract": "In machine learning, a question of great interest is understanding what examples are challenging for a model to classify. Identifying atypical examples helps inform safe deployment of models, isolates examples that require further human inspection, and provides interpretability into model behavior. In this work, we propose the Variance of Gradients (VOG) as a valuable and efficient proxy metric for detecting outliers in the data distribution. We provide quantitative and qualitative support that VOG is a meaningful way to rank data by difficulty and to surface a tractable subset of the most challenging examples for human-in-the-loop auditing. 
Data points with high VOG scores are more difficult for the model to learn and over-index on examples that require memorization.", + "title": "Estimating Example Difficulty using Variance of Gradients", + "authors": [ + "Chirag Agarwal", + "Sara Hooker" + ], + "emails": [ + "~Chirag_Agarwal1", + "~Sara_Hooker1" + ], + "rank": 1703 + }, + { + "url": "https://openreview.net/forum?id=awnQ2qTLSwn", + "decision": "Reject", + "ratings": [ + 3, + 8, + 8, + 4, + 4 + ], + "rating": "5.11", + "confidences": [ + 4, + 3, + 3, + 4, + 4 + ], + "abstract": "In this paper, we study the problem of networked multi-agent reinforcement learning (MARL), where a number of agents are deployed as a partially connected network. Networked MARL requires all agents make decision in a decentralized manner to optimize a global objective with restricted communication between neighbors over the network. We propose a hierarchically decentralized MARL method, \\textit{LToS}, which enables agents to learn to dynamically share reward with neighbors so as to encourage agents to cooperate on the global objective. For each agent, the high-level policy learns how to share reward with neighbors to decompose the global objective, while the low-level policy learns to optimize local objective induced by the high-level policies in the neighborhood. The two policies form a bi-level optimization and learn alternately. 
We empirically demonstrate that LToS outperforms existing methods in both social dilemma and two networked MARL scenarios.", + "title": "Learning to Share in Multi-Agent Reinforcement Learning", + "authors": [ + "Yuxuan Yi", + "Ge Li", + "Yaowei Wang", + "Zongqing Lu" + ], + "emails": [ + "~Yuxuan_Yi1", + "~Ge_Li2", + "~Yaowei_Wang1", + "~Zongqing_Lu2" + ], + "rank": 1704 + }, + { + "url": "https://openreview.net/forum?id=4NrO5vqdkwj", + "ratings": [ + 4, + 5, + 4, + 8 + ], + "rating": "5.11", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "Streaming label learning aims to model newly emerged labels for multi-label classification systems, which requires plenty of new label data for training. However, in changing environments, only a small amount of new label data can practically be collected. In this work, we formulate and study few-shot streaming label learning (FSLL), which models emerging new labels with only a few annotated examples by utilizing the knowledge learned from past labels. We propose a meta-learning framework, Semantic Inference Network (SIN), which can learn and infer the semantic correlation between new labels and past labels to adapt FSLL tasks from a few examples effectively. SIN leverages label semantic representation to regularize the output space and acquires label-wise meta-knowledge based on the gradient-based meta-learning. Moreover, SIN incorporates a novel label decision module with a meta-threshold loss to find the optimal confidence thresholds for each new label. Theoretically, we demonstrate that the proposed semantic inference mechanism could constrain the complexity of hypotheses space to reduce the risk of overfitting and achieve better generalizability. 
Experimentally, extensive empirical results and ablation studies illustrate the superior performance of SIN over the prior state-of-the-art methods on FSLL.", + "title": "Semantic Inference Network for Few-shot Streaming Label Learning", + "authors": [ + "Zhen Wang", + "Liu Liu", + "Yiqun Duan", + "Dacheng Tao" + ], + "emails": [ + "~Yiqun_Duan1", + "~Dacheng_Tao1", + "~Liu_Liu8", + "~Zhen_Wang9" + ], + "rank": 1705 + }, + { + "url": "https://openreview.net/forum?id=mu0WNwWWWCE", + "ratings": [ + 4, + 7, + 5, + 4 + ], + "rating": "5.11", + "confidences": [ + 3, + 5, + 5, + 5 + ], + "abstract": "Contemporary neural networks are limited in their ability to learn from evolving streams of training data. When trained sequentially on new or evolving tasks, their accuracy drops sharply, making them unsuitable for many real-world applications. In this work, we shed light on the causes of this well known yet unsolved phenomenon - often referred to as catastrophic forgetting - in a class-incremental setup. We show that a combination of simple components and a loss that balances intra-task and inter-task learning can already resolve forgetting to the same extent as more complex measures proposed in literature. Moreover, we identify poor quality of the learned representation as another reason for catastrophic forgetting in class-IL. We show that performance is correlated with secondary class information (dark knowledge) and it can be improved by an appropriate regularizer. With these lessons learned, class-incremental learning results on CIFAR-100 and ImageNet improve over the state-of-the-art by a large margin, while keeping the approach simple. 
", + "title": "Essentials for Class Incremental Learning", + "authors": [ + "Sudhanshu Mittal", + "Silvio Galesso", + "Thomas Brox" + ], + "emails": [ + "~Sudhanshu_Mittal2", + "~Silvio_Galesso1", + "~Thomas_Brox1" + ], + "rank": 1706 + }, + { + "url": "https://openreview.net/forum?id=G0VouKj9HUG", + "decision": "Reject", + "ratings": [ + 4, + 7, + 5 + ], + "rating": "5.11", + "confidences": [ + 3, + 2, + 4 + ], + "abstract": "Learning functions over Boolean variables is a fundamental problem in machine learning. But not much is known about learning such functions by neural networks. Because learning these functions in the distribution free setting is NP-Hard, they are unlikely to be efficiently learnable by networks in this case. However, assuming the inputs are sampled from the uniform distribution, an important subset of functions that are known to be efficiently learnable is read-once DNFs. Here we focus on this setting where the functions are learned by a convex neural network and gradient descent. \nWe first observe empirically that the learned neurons are aligned with the terms of the DNF, despite the fact that there are many zero-error networks that do not have this property. Thus, the learning process has a clear inductive bias towards such logical formulas. To gain a better theoretical understanding of this phenomenon we focus on minimizing the population risk. We show that this risk can be minimized by multiple networks: from ones that memorize data to ones that compactly represent the DNF. We then set out to understand why gradient descent ``\"chooses\" the compact representation. \nWe use a computer assisted proof to prove the inductive bias for relatively small DNFs, and use it to design a process for reconstructing the DNF from the learned network. We then continue to provide theoretical insights on the learning process and the loss surface to better understand the resulting inductive bias. 
For example, we show that the neurons in solutions with minimum l2-norm of the weights are also aligned with the terms of the DNF. Finally, we empirically show that our results are validated in the empirical case for high dimensional DNFs, more general network architectures and tabular datasets.", + "title": "On Learning Read-once DNFs With Neural Networks", + "authors": [ + "Ido Bronstein", + "Alon Brutzkus", + "Amir Globerson" + ], + "emails": [ + "~Amir_Globerson1", + "~Alon_Brutzkus1", + "~Ido_Bronstein2" + ], + "rank": 1707 + }, + { + "url": "https://openreview.net/forum?id=I-VfjSBzi36", + "decision": "Reject", + "ratings": [ + 3, + 5, + 7, + 6, + 5 + ], + "rating": "5.11", + "confidences": [ + 4, + 3, + 3, + 4, + 4 + ], + "abstract": "Deep, heavily overparameterized language models such as BERT, XLNet and T5 have achieved impressive success in many NLP tasks. However, their high model complexity requires enormous computation resources and extremely long training time for both pre-training and fine-tuning. Many works have studied model compression on large NLP models, but only focus on reducing inference cost/time, while still requiring expensive training process. Other works use extremely large batch sizes to shorten the pre-training time at the expense of high demand for computation resources. In this paper, inspired by the Early-Bird Lottery Tickets studied for computer vision tasks, we propose EarlyBERT, a general computationally-efficient training algorithm applicable to both pre-training and fine-tuning of large-scale language models. We are the first to identify structured winning tickets in the early stage of BERT training, and use them for efficient training. 
Comprehensive pre-training and fine-tuning experiments on GLUE and SQuAD downstream tasks show that EarlyBERT easily achieves comparable performance to standard BERT with 35~45% less training time.", + "title": "EarlyBERT: Efficient BERT Training via Early-bird Lottery Tickets", + "authors": [ + "Xiaohan Chen", + "Yu Cheng", + "Shuohang Wang", + "Zhe Gan", + "Zhangyang Wang", + "Jingjing Liu" + ], + "emails": [ + "~Jingjing_Liu2", + "~Zhangyang_Wang1", + "~Shuohang_Wang1", + "~Xiaohan_Chen1", + "~Yu_Cheng1", + "~Zhe_Gan1" + ], + "rank": 1708 + }, + { + "url": "https://openreview.net/forum?id=whNntrHtB8D", + "decision": "Reject", + "ratings": [ + 5, + 7, + 3, + 6 + ], + "rating": "5.11", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "Prior work on continual learning often operate in a \u201ctask-aware\u201d manner, by assuming that the task boundaries and identifies of the data examples are known at all times. While in practice, it is rarely the case that such information are exposed to the methods (i.e., thus called \u201ctask-free\u201d)\u2013a setting that is relatively underexplored. Recent attempts on task-free continual learning build on previous memory replay methods and focus on developing memory construction and replay strategies such that model performance over previously seen examples can be best retained. In this paper, looking from a complementary angle, we propose a novel approach to \u201cedit\u201d memory examples so that the edited memory can better retain past performance when they are replayed. We use gradient updates to edit memory examples so that they are more likely to be \u201cforgotten\u201d in the future. 
Experiments on five benchmark datasets show the proposed method can be seamlessly combined with baselines to significantly improve the performance.", + "title": "Gradient Based Memory Editing for Task-Free Continual Learning", + "authors": [ + "Xisen Jin", + "Junyi Du", + "Xiang Ren" + ], + "emails": [ + "~Xiang_Ren1", + "~Junyi_Du1", + "~Xisen_Jin3" + ], + "rank": 1709 + }, + { + "url": "https://openreview.net/forum?id=32B5lOqZUiO", + "ratings": [ + 5, + 5, + 4, + 6 + ], + "rating": "5.11", + "confidences": [ + 5, + 5, + 3, + 5 + ], + "abstract": "Designing feasible and effective architectures is essential for deploying deep models to real-world scenarios. In practice, one has to consider multiple objectives (e.g., model performance and computational cost) and diverse constraints incurred by different computation resources. To address this, most methods seek to find promising architectures via optimizing a well pre-defined utility function. However, it is often non-trivial to design an ideal function that could well trade-off different objectives. More critically, in many real scenarios, even for the same platform, we may have different applications with various latency budgets. To find promising architectures under different budgets, existing methods may have to perform an independent search for each budget, which is very inefficient and unnecessary. Nevertheless, it would be fantastic if we can produce multiple promising architectures to fulfill each budget in the same search process. In this paper, we propose a Pareto-Frontier-aware Neural Architecture Search (PFNAS) method which seeks to learn the Pareto frontier (i.e., the set of Pareto optimal architectures) w.r.t. multiple objectives. Here, we formulate the Pareto frontier learning problem as a Markov decision process (MDP). Relied on the MDP, we transform and absorb the objectives other than model performance into the constraints. 
To learn the whole Pareto frontier, we propose to find a set of Pareto optimal architectures which are uniformly distributed on the range of budget to form a frontier. Based on the learned frontier, we are able to easily find multiple promising architectures to fulfill all considered constraints in the same search process. Extensive experiments on three hardware platforms (i.e., mobile, CPU, and GPU) show that the searched architectures by our PFNAS outperform the ones obtained by existing methods under different budgets.", + "title": "Pareto-Frontier-aware Neural Architecture Search", + "authors": [ + "Yong Guo", + "Yaofo Chen", + "Yin Zheng", + "Peilin Zhao", + "Jian Chen", + "Junzhou Huang", + "Mingkui Tan" + ], + "emails": [ + "scut.edu.cn", + "~Yin_Zheng1", + "~Yong_Guo1", + "gmail.com", + "~Mingkui_Tan2", + "~Junzhou_Huang2", + "~Peilin_Zhao2" + ], + "rank": 1710 + }, + { + "url": "https://openreview.net/forum?id=0BaWDGvCa5p", + "decision": "Reject", + "ratings": [ + 4, + 6, + 6 + ], + "rating": "5.11", + "confidences": [ + 4, + 2, + 3 + ], + "abstract": "We present a first-order algorithm for nonconvex-nonconcave min-max optimization problems such as those that arise in training GANs. Our algorithm provably converges in poly(d,L,b) steps for any loss function f:Rd\u00d7Rd\u2192R which is b-bounded with L-Lipschitz gradient. To achieve convergence, we 1) give a novel approximation to the global strategy of the max-player based on first-order algorithms such as gradient ascent, and 2) empower the min-player to look ahead and simulate the max-player\u2019s response for arbitrarily many steps, but restrict the min-player to move according to updates sampled from a stochastic gradient oracle. 
Our algorithm, when used to train GANs on synthetic and real-world datasets, does not cycle, results in GANs that seem to avoid mode collapse, and achieves a training time per iteration and memory requirement similar to gradient descent-ascent.\n", + "title": "A Provably Convergent and Practical Algorithm for Min-Max Optimization with Applications to GANs", + "authors": [ + "Oren Mangoubi", + "Sushant Sachdeva", + "Nisheeth K Vishnoi" + ], + "emails": [ + "~Sushant_Sachdeva1", + "~Nisheeth_K_Vishnoi1", + "~Oren_Mangoubi1" + ], + "rank": 1711 + }, + { + "url": "https://openreview.net/forum?id=qzqBl_nOeAQ", + "ratings": [ + 7, + 6, + 4, + 4 + ], + "rating": "5.11", + "confidences": [ + 4, + 4, + 5, + 5 + ], + "abstract": "Most gradient-based neural architecture search methods construct a super-net for search and derive a target-net as its sub-graph for evaluation. There is a significant gap between the architectures in search and evaluation. As a result, current methods suffer from the problem of an inaccurate, inefficient, and inflexible search process. In this paper, we aim to close the gap and solve these problems. We introduce EnTranNAS that is composed of Engine-cells and Transit-cells. The Engine-cell is differentiable for architecture search, while the Transit-cell only transits the current sub-graph by architecture derivation. Consequently, the gap between the architectures in search and evaluation is significantly reduced. Our method also spares much memory and computation cost, which speeds up the search process. A feature sharing strategy is introduced for more efficient parameter training in the search phase. Furthermore, we develop a new architecture derivation method to replace the traditional one that is based on a hand-crafted rule. Our method enables differentiable sparsification, so keeps the derived architecture equivalent to the one in search. 
Besides, it supports the search for topology where a node can be connected to prior nodes with any number of connections, so that the searched architectures could be more flexible. For experiments on CIFAR-10, our search on the standard space requires only 0.06 GPU-day. We further have an error rate of 2.22% with 0.07 GPU-day for the search on an extended space. We can directly perform our search on ImageNet with topology learnable and achieve a top-1 error rate of 23.2%. Code will be released.", + "title": "EnTranNAS: Towards Closing the Gap between the Architectures in Search and Evaluation", + "authors": [ + "Yibo Yang", + "Shan You", + "Hongyang Li", + "Fei Wang", + "Chen Qian", + "Zhouchen Lin" + ], + "emails": [ + "~Chen_Qian1", + "~Hongyang_Li2", + "~Zhouchen_Lin1", + "~Yibo_Yang2", + "~Shan_You3", + "~Fei_Wang9" + ], + "rank": 1712 + }, + { + "url": "https://openreview.net/forum?id=Ao2-JgYxuQf", + "decision": "Reject", + "ratings": [ + 5, + 3, + 8 + ], + "rating": "5.10", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "We introduce Active Tuning, a novel paradigm for optimizing the internal dynamics of recurrent neural networks (RNNs) on the fly. In contrast to the conventional sequence-to-sequence mapping scheme, Active Tuning decouples the RNN's recurrent neural activities from the input stream, using the unfolding temporal gradient signal to tune the internal dynamics into the data stream. As a consequence, the model output depends only on its internal hidden dynamics and the closed-loop feedback of its own predictions; its hidden state is continuously adapted by means of the temporal gradient resulting from backpropagating the discrepancy between the signal observations and the model outputs through time. In this way, Active Tuning infers the signal actively but indirectly based on the originally learned temporal patterns, fitting the most plausible hidden state sequence into the observations. 
We demonstrate the effectiveness of Active Tuning on several time series prediction benchmarks, including multiple super-imposed sine waves, a chaotic double pendulum, and spatiotemporal wave dynamics. Active Tuning consistently improves the robustness, accuracy, and generalization abilities of all evaluated models. Moreover, networks trained for signal prediction and denoising can be successfully applied to a much larger range of noise conditions with the help of Active Tuning. Thus, given a capable time series predictor, Active Tuning enhances its online signal filtering, denoising, and reconstruction abilities without the need for additional training.", + "title": "Active Tuning", + "authors": [ + "Sebastian Otte", + "Matthias Karlbauer", + "Martin V. Butz" + ], + "emails": [ + "uni-tuebingen.de", + "~Matthias_Karlbauer1", + "~Sebastian_Otte1" + ], + "rank": 1713 + }, + { + "url": "https://openreview.net/forum?id=ml1LSu49FLZ", + "decision": "Reject", + "ratings": [ + 7, + 4, + 4 + ], + "rating": "5.09", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "Training on disjoint fixed-length segments, Transformers successfully transform static word embeddings into contextualized word representations. However, they often restrict the context of a token to the segment it resides in and hence neglect the flow of contextual information across segments, failing to capture longer-term dependencies beyond the predefined segment length. This paper uses a probabilistic deep topic model to provide contextualized embeddings at both the token and segment levels. It also introduces topic self-attention and a contextual next-word embedding guided topic select-attention, injecting contextualized topic information into Transformer-based architectures. 
Moving beyond conventional Transformers that ignore longer-range word dependencies and contextualize their word representations at the segment level, the proposed method not only captures global semantic coherence of all segments and global word concurrence patterns, but also enriches the representation of each token by adapting it to its local context, which is not limited to the segment it resides in and can be flexibly defined according to the task. Experiments on various corpora show that adding only a few extra parameters, the proposed topic-aware contextualized transformers consistently outperform their conventional counterparts, and can be used to generate coherent sentences and paragraphs.", + "title": "Topic-aware Contextualized Transformers", + "authors": [ + "Ruiying Lu", + "Bo Chen", + "Dan dan Guo", + "Dongsheng Wang", + "Mingyuan Zhou" + ], + "emails": [ + "~Dan_dan_Guo1", + "~Ruiying_Lu1", + "~Bo_Chen1", + "163.com", + "~Mingyuan_Zhou1" + ], + "rank": 1714 + }, + { + "url": "https://openreview.net/forum?id=gRr_gt5bker", + "decision": "Reject", + "ratings": [ + 7, + 5, + 4 + ], + "rating": "5.09", + "confidences": [ + 3, + 3, + 5 + ], + "abstract": "Multi-agent imitation learning aims to train multiple agents to perform tasks from demonstrations by learning a mapping between observations and actions, which is essential for understanding physical, social, and team-play systems. However, most existing works on modeling multi-agent interactions typically assume that agents make independent decisions based on their observations, ignoring the complex dependence among agents. In this paper, we propose to use copula, a powerful statistical tool for capturing dependence among random variables, to explicitly model the correlation and coordination in multi-agent systems. 
Our proposed model is able to separately learn marginals that capture the local behavioral patterns of each individual agent, as well as a copula function that solely and fully captures the dependence structure among agents. Extensive experiments on synthetic and real-world datasets show that our model outperforms state-of-the-art baselines across various scenarios in the action prediction task, and is able to generate new trajectories close to expert demonstrations.", + "title": "Multi-Agent Imitation Learning with Copulas", + "authors": [ + "Hongwei Wang", + "Lantao Yu", + "Zhangjie Cao", + "Stefano Ermon" + ], + "emails": [ + "~Lantao_Yu2", + "~Stefano_Ermon1", + "~Hongwei_Wang1", + "~Zhangjie_Cao1" + ], + "rank": 1715 + }, + { + "url": "https://openreview.net/forum?id=McYsRk9-rso", + "decision": "Reject", + "ratings": [ + 6, + 5, + 4, + 6 + ], + "rating": "5.08", + "confidences": [ + 3, + 4, + 4, + 2 + ], + "abstract": "A fundamental shortcoming of deep neural networks is their specialization to a single task and domain. While recent techniques in multi-domain learning enable the learning of more domain-agnostic features, their success relies firmly on the presence of domain labels, typically requiring manual annotation and careful curation of datasets. Here we focus on latent domain learning, a highly realistic, yet less explored scenario: learning from data from different domains, without access to domain annotations. This is a particularly challenging problem, since standard models exhibit an implicit bias toward learning only the large domains in data, while disregarding smaller ones. To address this issue, we propose dynamic residual adapters that adaptively account for latent domains, and weighted domain transfer \u2013 a novel augmentation strategy designed specifically for this setting. 
Our techniques are evaluated on image classification tasks containing multiple unannotated domains, and we demonstrate they enhance performance, in particular, on the smallest of these.", + "title": "Reducing Implicit Bias in Latent Domain Learning", + "authors": [ + "Lucas Deecke", + "Timothy Hospedales", + "Hakan Bilen" + ], + "emails": [ + "~Hakan_Bilen1", + "~Lucas_Deecke1", + "~Timothy_Hospedales1" + ], + "rank": 1716 + }, + { + "url": "https://openreview.net/forum?id=vY0bnzBBvtr", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4, + 5 + ], + "rating": "5.08", + "confidences": [ + 4, + 3, + 2, + 4 + ], + "abstract": "Motivated by the episodic version of the classical inventory control problem, we propose a new Q-learning-based algorithm, Elimination-Based Half-Q-Learning (HQL), that enjoys improved efficiency over existing algorithms for a wide variety of problems in the one-sided-feedback setting. We also provide a simpler variant of the algorithm, Full-Q-Learning (FQL), for the full-feedback setting. We establish that HQL incurs O~(H3T) regret and FQL incurs O~(H2T) regret, where H is the length of each episode and T is the total length of the horizon. The regret bounds are not affected by the possibly huge state and action space. Our numerical experiments demonstrate the superior efficiency of HQL and FQL, and the potential to combine reinforcement learning with richer feedback models.", + "title": "Provably More Efficient Q-Learning in the One-Sided-Feedback/Full-Feedback Settings", + "authors": [ + "Xiao-Yue Gong", + "David Simchi-Levi" + ], + "emails": [ + "~David_Simchi-Levi2", + "~Xiao-Yue_Gong1" + ], + "rank": 1717 + }, + { + "url": "https://openreview.net/forum?id=_MxHo0GHsH6", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6, + 4 + ], + "rating": "5.08", + "confidences": [ + 3, + 4, + 2, + 4 + ], + "abstract": "Automatic search of Quantized Neural Networks (QNN) has attracted a lot of attention. 
However, the existing quantization-aware Neural Architecture Search (NAS) approaches inherit a two-stage search-retrain schema, which is not only time-consuming but also adversely affected by the unreliable ranking of architectures during the search. To avoid the undesirable effect of the search-retrain schema, we present Once Quantized for All (OQA), a novel framework that searches for quantized compact models and deploys their quantized weights at the same time without additional post-process. While supporting a huge architecture search space, our OQA can produce a series of quantized compact models under ultra-low bit-widths(e.g. 4/3/2 bit). A progressive bit inheritance procedure is introduced to support ultra-low bit-width. Our searched model family, OQANets, achieves a new state-of-the-art (SOTA) on quantized compact models compared with various quantization methods and bit-widths. In particular, OQA2bit-L achieves 64.0\\% ImageNet Top-1 accuracy, outperforming its 2 bit counterpart EfficientNet-B0@QKD by a large margin of 14\\% using 30\\% less computation cost. ", + "title": "Once Quantized for All: Progressively Searching for Quantized Compact Models", + "authors": [ + "Mingzhu Shen", + "Feng Liang", + "Chuming Li", + "Chen Lin", + "Ming Sun", + "Junjie Yan", + "Wanli Ouyang" + ], + "emails": [ + "~Wanli_Ouyang1", + "~Junjie_Yan4", + "~Ming_Sun4", + "~Feng_Liang3", + "~Chuming_Li1", + "~Chen_Lin2", + "~Mingzhu_Shen1" + ], + "rank": 1718 + }, + { + "url": "https://openreview.net/forum?id=nQxCYIFk7Rz", + "decision": "Reject", + "ratings": [ + 6, + 6, + 4, + 5 + ], + "rating": "5.08", + "confidences": [ + 2, + 3, + 4, + 4 + ], + "abstract": "This paper explores the generalization loss of linear regression in variably parameterized families of models, both under-parameterized and over-parameterized. We show that the generalization curve can have an arbitrary number of peaks, and moreover, locations of those peaks can be explicitly controlled. 
Our results highlight the fact that both classical U-shaped generalization curve and the recently observed double descent curve are not intrinsic properties of the model family. Instead, their emergence is due to the interaction between the properties of the data and the inductive biases of learning algorithms. ", + "title": "Multiple Descent: Design Your Own Generalization Curve", + "authors": [ + "Lin Chen", + "Yifei Min", + "Mikhail Belkin", + "amin karbasi" + ], + "emails": [ + "~Yifei_Min1", + "~Lin_Chen14", + "~amin_karbasi1", + "ucsd.edu" + ], + "rank": 1719 + }, + { + "url": "https://openreview.net/forum?id=EKb4Z0aSNf", + "decision": "Reject", + "ratings": [ + 4, + 3, + 7, + 7 + ], + "rating": "5.08", + "confidences": [ + 3, + 4, + 4, + 2 + ], + "abstract": "Deep learning algorithms are known to experience destructive interference when instances violate the assumption of being independent and identically distributed (i.i.d). This violation, however, is ubiquitous in clinical settings where data are streamed temporally and from a multitude of physiological sensors. To overcome this obstacle, we propose CLOPS, a replay-based continual learning strategy. In three continual learning scenarios based on three publically-available datasets, we show that CLOPS can outperform the state-of-the-art methods, GEM and MIR. Moreover, we propose end-to-end trainable parameters, which we term task-instance parameters, that can be used to quantify task difficulty and similarity. This quantification yields insights into both network interpretability and clinical applications, where task difficulty is poorly quantified.", + "title": "CLOPS: Continual Learning of Physiological Signals", + "authors": [ + "Dani Kiyasseh", + "Tingting Zhu", + "David A. 
Clifton" + ], + "emails": [ + "~Dani_Kiyasseh1", + "~David_A._Clifton1", + "eng.ox.ac.uk" + ], + "rank": 1720 + }, + { + "url": "https://openreview.net/forum?id=ey1XXNzcIZS", + "decision": "Reject", + "ratings": [ + 5, + 4, + 6 + ], + "rating": "5.08", + "confidences": [ + 4, + 4, + 5 + ], + "abstract": "Sparsely-Gated Mixture-of-Experts (MoE) has been a successful approach for scaling multilingual translation models to billions of parameters without a proportional increase in training computation. These models, however, are prohibitively large for serving deployment and there is no easy way to extract a sub-network to decode for a particular language pair. This work proposes improved strategies to route MoE models by tasks instead of tokens, thus enabling separation of network structures at decoding time while enjoying the benefits of scale and task sharing at training time.\nWe compare routing strategies at multiple levels (token, sentence, task) in both, the encoder and the decoder, and conduct extensive experiments on two benchmarks: the public WMT dataset of 30 language pairs and an in-house web-scale dataset of 200 language pairs. On WMT, with a Transformer base model with 32 experts, our task-level MoE outperforms the best performing token-level MoE model by +1.0 BLEU on average over all language pairs. 
When scaling up to Transformer big model with 128 experts on the large-scale massively multilingual benchmark, our task-level MoE is competitive with token-level MoE while being able to reduce the decoder model size by a factor of 32.34 and increase peak throughput by 2.6 times at inference.", + "title": "Exploring Routing Strategies for Multilingual Mixture-of-Experts Models", + "authors": [ + "Sneha Kudugunta", + "Yanping Huang", + "Ankur Bapna", + "Maxim Krikun", + "Dmitry Lepikhin", + "Thang Luong", + "Orhan Firat" + ], + "emails": [ + "~Sneha_Kudugunta1", + "~Orhan_Firat1", + "google.com", + "~Ankur_Bapna1", + "~Thang_Luong1", + "~Yanping_Huang1" + ], + "rank": 1721 + }, + { + "url": "https://openreview.net/forum?id=wG5XIGi6nrt", + "decision": "Reject", + "ratings": [ + 6, + 6, + 4, + 5 + ], + "rating": "5.08", + "confidences": [ + 3, + 2, + 4, + 4 + ], + "abstract": "How can we learn a representation with good predictive power while preserving user privacy?\nWe present an adversarial representation learning method to sanitize sensitive content from the representation in an adversarial fashion.\nSpecifically, we propose focal entropy - a variant of entropy embedded in an adversarial representation learning setting to leverage privacy sanitization. Focal entropy enforces maximum uncertainty in terms of confusion on the subset of privacy-related similar classes, separated from the dissimilar ones. As such, our proposed sanitization method yields deep sanitization of private features yet is conceptually simple and empirically powerful. We showcase feasibility in terms of classification of facial attributes and identity on the CelebA dataset as well as CIFAR-100. The results suggest that private components can be removed reliably. 
", + "title": "Learning Private Representations with Focal Entropy", + "authors": [ + "Tassilo Klein", + "Moin Nabi" + ], + "emails": [ + "~Tassilo_Klein1", + "~Moin_Nabi1" + ], + "rank": 1722 + }, + { + "url": "https://openreview.net/forum?id=DHkGKg2fJay", + "ratings": [ + 6, + 3, + 7, + 4 + ], + "rating": "5.08", + "confidences": [ + 3, + 4, + 4, + 2 + ], + "abstract": "As an important branch of weakly supervised learning, partial label learning deals with data where each instance is assigned with a set of candidate labels, whereas only one of them is true. In this paper, we propose a family of loss functions named Leveraged Weighted (LW) loss function, which for the first time introduces the leverage parameter \u03b2 to partial loss functions to leverage between losses on partial labels and residual labels (non-partial labels). Under mild assumptions, we achieve the relationship between the partial loss function and its corresponding ordinary loss that leads to the consistency in risk. Compared to the existing literatures, our result applies to both deterministic and stochastic scenarios, considers the loss functions of a more general form, and takes milder assumptions on the distribution of the partial label set. As special cases, with \u03b2=1 and \u03b2=2, the corresponding ordinary losses of our LW loss respectively match the binary classification loss and the \\textit{one-versus-all} (OVA) loss function. In this way, our theorems successfully explain the experimental results on parameter analysis, where \u03b2=1 and especially \u03b2=2 are considered as preferred choices for the leverage parameter \u03b2. 
Last but not least, real data comparisons show the high effectiveness of our LW loss over other state-of-the-art partial label learning algorithms.", + "title": "Leveraged Weighted Loss For Partial Label Learning", + "authors": [ + "Hongwei Wen", + "Hanyuan Hang", + "Jiabin Liu", + "Zhouchen Lin" + ], + "emails": [ + "~Hanyuan_Hang1", + "~Zhouchen_Lin1", + "~Hongwei_Wen1", + "~Jiabin_Liu1" + ], + "rank": 1723 + }, + { + "url": "https://openreview.net/forum?id=TSrvUnWkjGR", + "decision": "Reject", + "ratings": [ + 6, + 3, + 7 + ], + "rating": "5.08", + "confidences": [ + 3, + 5, + 4 + ], + "abstract": "Deep generative models (e.g. GANs and VAEs) have been developed quite extensively in recent years. Lately, there has been an increased interest in the inversion of such a model, i.e. given a (possibly corrupted) signal, we wish to recover the latent vector that generated it. Building upon sparse representation theory, we define conditions that rely only on the cardinalities of the hidden layer and are applicable to any inversion algorithm (gradient descent, deep encoder, etc.), under which such generative models are invertible with a unique solution. Importantly, the proposed analysis is applicable to any trained model, and does not depend on Gaussian i.i.d. weights. Furthermore, we introduce two layer-wise inversion pursuit algorithms for trained generative networks of arbitrary depth, where one of them is accompanied by recovery guarantees. 
Finally, we validate our theoretical results numerically and show that our method outperforms gradient descent when inverting such generators, both for clean and corrupted signals.", + "title": "On the Inversion of Deep Generative Models", + "authors": [ + "Aviad Aberdam", + "Dror Simon", + "Michael Elad" + ], + "emails": [ + "~Aviad_Aberdam1", + "~Dror_Simon1", + "~Michael_Elad1" + ], + "rank": 1724 + }, + { + "url": "https://openreview.net/forum?id=JI2TGOehNT0", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 4 + ], + "rating": "5.08", + "confidences": [ + 2, + 2, + 5, + 4 + ], + "abstract": "Imitation Learning (IL) and Reinforcement Learning (RL) from high dimensional sensory inputs are often introduced as separate problems, but a more realistic problem setting is how to merge the techniques so that the agent can reduce exploration costs by partially imitating experts at the same time it maximizes its return. Even when the experts are suboptimal (e.g. Experts learned halfway with other RL methods or human-crafted experts), it is expected that the agent outperforms the suboptimal experts\u2019 performance. In this paper, we propose to address the issue by using and theoretically extending Free Energy Principle, a unified brain theory that explains perception, action and model learning in a Bayesian probabilistic way. We find that both IL and RL can be achieved based on the same free energy objective function. 
Our results show that our approach is promising in visual control tasks especially with sparse-reward environments.", + "title": "Combining Imitation and Reinforcement Learning with Free Energy Principle", + "authors": [ + "Ryoya Ogishima", + "Izumi Karino", + "Yasuo Kuniyoshi" + ], + "emails": [ + "~Yasuo_Kuniyoshi1", + "~Ryoya_Ogishima1", + "isi.imi.i.u-tokyo.ac.jp" + ], + "rank": 1725 + }, + { + "url": "https://openreview.net/forum?id=uz5uw6gM0m", + "decision": "Accept (Poster)", + "ratings": [ + 5, + 6, + 3, + 7 + ], + "rating": "5.08", + "confidences": [ + 2, + 3, + 4, + 3 + ], + "abstract": "Can deep learning solve multiple, very different tasks simultaneously? We investigate how the representations of the underlying tasks affect the ability of a single neural network to learn them jointly. We present theoretical and empirical findings that a single neural network is capable of simultaneously learning multiple tasks from a combined data set, for a variety of methods for representing tasks---for example, when the distinct tasks are encoded by well-separated clusters or decision trees over some task-code attributes. Indeed, more strongly, we present a novel analysis that shows that families of simple programming-like constructs for the codes encoding the tasks are learnable by two-layer neural networks with standard training. We study more generally how the complexity of learning such combined tasks grows with the complexity of the task codes; we find that learning many tasks can be provably hard, even though the individual tasks are easy to learn. We provide empirical support for the usefulness of the learning bounds by training networks on clusters, decision trees, and SQL-style aggregation.", + "title": "One Network Fits All? 
Modular versus Monolithic Task Formulations in Neural Networks", + "authors": [ + "Atish Agarwala", + "Abhimanyu Das", + "Brendan Juba", + "Rina Panigrahy", + "Vatsal Sharan", + "Xin Wang", + "Qiuyi Zhang" + ], + "emails": [ + "~Brendan_Juba1", + "~Vatsal_Sharan1", + "~Rina_Panigrahy1", + "google.com", + "~Qiuyi_Zhang1" + ], + "rank": 1726 + }, + { + "url": "https://openreview.net/forum?id=mDAZVlBeXWx", + "decision": "Reject", + "ratings": [ + 5, + 3, + 6, + 6 + ], + "rating": "5.08", + "confidences": [ + 3, + 3, + 4, + 3 + ], + "abstract": "There has been growing interest in representation learning for text data, based on theoretical arguments and empirical evidence. One important direction involves leveraging contrastive learning to improve learned representations. We propose an application of contrastive learning for intermediate textual feature pairs, to explicitly encourage the model to learn more distinguishable representations. To overcome the learner's degeneracy due to vanishing contrasting signals, we impose Wasserstein constraints on the critic via spectral regularization.\n Finally, to moderate such an objective from overly regularized training and to enhance learning efficiency, with theoretical justification, we further leverage an active negative-sample-selection procedure to only use high-quality contrast examples. We evaluate the proposed method over a wide range of natural language processing applications, from the perspectives of both supervised and unsupervised learning. Empirical results show consistent improvement over baselines. 
", + "title": "Towards Robust and Efficient Contrastive Textual Representation Learning", + "authors": [ + "Liqun Chen", + "Yizhe Zhang", + "Dianqi Li", + "Chenyang Tao", + "Dong Wang", + "Lawrence Carin" + ], + "emails": [ + "~Yizhe_Zhang2", + "~Chenyang_Tao1", + "~Dianqi_Li1", + "~Liqun_Chen2", + "~Lawrence_Carin2", + "~Dong_Wang2" + ], + "rank": 1727 + }, + { + "url": "https://openreview.net/forum?id=jz7tDvX6XYR", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5, + 5 + ], + "rating": "5.07", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "It has been widely observed that increasing deep learning model sizes often leads to significant performance improvements on a variety of natural language processing and computer vision tasks. In the meantime, however, computational costs and training time would dramatically increase when models get larger. In this paper, we propose a simple approach to speed up training for a particular kind of deep networks which contain repeated structures, such as the transformer module. In our method, we first train such a deep network with the weights shared across all the repeated layers till some point. We then stop weight sharing and continue training until convergence. The untying point is automatically determined by monitoring gradient statistics. Our adaptive untying criterion is obtained from a theoretic analysis over deep linear networks. Empirical results show that our method is able to reduce the training time of BERT by 50%. 
", + "title": "Speeding up Deep Learning Training by Sharing Weights and Then Unsharing", + "authors": [ + "Shuo Yang", + "Le Hou", + "Xiaodan Song", + "qiang liu", + "Denny Zhou" + ], + "emails": [ + "~qiang_liu4", + "~Le_Hou1", + "~Xiaodan_Song1", + "~Denny_Zhou1", + "~Shuo_Yang6" + ], + "rank": 1728 + }, + { + "url": "https://openreview.net/forum?id=SzjyTIc5qMP", + "decision": "Reject", + "ratings": [ + 4, + 4, + 7, + 6 + ], + "rating": "5.07", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Information Lattice Learning (ILL) is a general framework to learn decomposed representations, called rules, of a signal such as an image or a probability distribution. Each rule is a coarsened signal used to gain some human-interpretable insight into what might govern the nature of the original signal. To summarize the signal, we need several disentangled rules arranged in a hierarchy, formalized by a lattice structure. ILL focuses on explainability and generalizability from \"small data\", and aims for rules akin to those humans distill from experience (rather than a representation optimized for a specific task like classification). This paper focuses on a mathematical and algorithmic presentation of ILL, then demonstrates how ILL addresses the core question \"what makes X an X\" or \"what makes X different from Y\" to create effective, rule-based explanations designed to help human learners understand. The key part here is \\emph{what} rather than tasks like generating X or predicting labels X,Y. Typical applications of ILL are presented for artistic and scientific knowledge discovery. These use ILL to learn music theory from scores and chemical laws from molecule data, revealing relationships between domains. We include initial benchmarks and assessments for ILL to demonstrate efficacy.", + "title": "Information Lattice Learning", + "authors": [ + "Haizi Yu", + "James Evans", + "Lav R. 
Varshney" + ], + "emails": [ + "~Haizi_Yu1", + "~Lav_R._Varshney1", + "~James_Evans1" + ], + "rank": 1729 + }, + { + "url": "https://openreview.net/forum?id=9p2CltauWEY", + "decision": "Reject", + "ratings": [ + 5, + 4, + 7, + 5 + ], + "rating": "5.07", + "confidences": [ + 3, + 5, + 3, + 4 + ], + "abstract": "Graph neural networks (GNNs) can process graphs of different sizes but their capacity to generalize across sizes is still not well understood. Size generalization is key to numerous GNN applications, from solving combinatorial optimization problems to learning in molecular biology. In such problems, obtaining labels and training on large graphs can be prohibitively expensive, but training on smaller graphs is possible. \n\nThis paper puts forward the size-generalization question and characterizes important aspects of that problem theoretically and empirically.\nWe prove that even for very simple tasks, such as counting the number of nodes or edges in a graph, GNNs do not naturally generalize to graphs of larger size. Instead, their generalization performance is closely related to the distribution of local patterns of connectivity and features and how that distribution changes from small to large graphs. Specifically, we prove that for many tasks, there are weight assignments for GNNs that can perfectly solve the task on small graphs but fail on large graphs, if there is a discrepancy between their local patterns. We further demonstrate on several tasks, that training GNNs on small graphs results in solutions which do not generalize to larger graphs. We then formalize size generalization as a domain-adaption problem and describe two learning setups where size generalization can be improved. First, as a self-supervised learning problem (SSL) over the target domain of large graphs. Second as a semi-supervised learning problem when few samples are available in the target domain. 
We demonstrate the efficacy of these solutions on a diverse set of benchmark graph datasets. ", + "title": "On Size Generalization in Graph Neural Networks", + "authors": [ + "Gilad Yehudai", + "Ethan Fetaya", + "Eli Meirom", + "Gal Chechik", + "Haggai Maron" + ], + "emails": [ + "~Gal_Chechik1", + "~Ethan_Fetaya1", + "~Gilad_Yehudai2", + "~Eli_Meirom2", + "~Haggai_Maron1" + ], + "rank": 1730 + }, + { + "url": "https://openreview.net/forum?id=V3o2w-jDeT5", + "decision": "Reject", + "ratings": [ + 3, + 6, + 6, + 5 + ], + "rating": "5.07", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "How can we conduct efficient hyperparameter optimization for a completely new task? In this work, we consider a novel setting, where we search for the optimal hyperparameters for a target task of interest using only unlabeled target task and \u2018somewhat relevant\u2019 source task datasets. In this setting, it is essential to estimate the ground-truth target task objective using only the available information. We propose estimators to unbiasedly approximate the ground-truth with a desirable variance property. Building on these estimators, we provide a general and tractable hyperparameter optimization procedure for our setting. The experimental evaluations demonstrate that the proposed framework broadens the applications of automated hyperparameter optimization.", + "title": "Multi-Source Unsupervised Hyperparameter Optimization", + "authors": [ + "Masahiro Nomura", + "Yuta Saito" + ], + "emails": [ + "~Yuta_Saito1", + "~Masahiro_Nomura1" + ], + "rank": 1731 + }, + { + "url": "https://openreview.net/forum?id=8_Ve-wi_IOx", + "ratings": [ + 4, + 6, + 4, + 6 + ], + "rating": "5.07", + "confidences": [ + 5, + 4, + 2, + 4 + ], + "abstract": "We develop an interpretable and learnable Wigner-Ville distribution that produces a super-resolved quadratic signal representation for time-series analysis.\nOur approach has two main hallmarks. 
\nFirst, it interpolates between known time-frequency representations (TFRs) in that it can reach super-resolution with increased time and frequency resolution beyond what the Heisenberg uncertainty principle prescribes and thus beyond commonly employed TFRs, \nSecond, it is interpretable thanks to an explicit low-dimensional and physical parametrization of the Wigner-Ville distribution.\nWe demonstrate that our approach is able to learn highly adapted TFRs and is ready and able to tackle various large-scale classification tasks, where we reach state-of-the-art performance compared to baseline and learned TFRs.", + "title": "Interpretable Super-Resolution via a Learned Time-Series Representation", + "authors": [ + "Randall Balestriero", + "Herv\u00e9 Glotin", + "Richard Baraniuk" + ], + "emails": [ + "~Herv\u00e9_Glotin1", + "~Richard_Baraniuk1", + "~Randall_Balestriero1" + ], + "rank": 1732 + }, + { + "url": "https://openreview.net/forum?id=a8OpoWkLA1", + "ratings": [ + 6, + 5, + 4, + 5 + ], + "rating": "5.07", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Convolutional Neural Networks (CNNs) are known to be significantly over-parametrized, and difficult to interpret, train and adapt. In this paper, we introduce a structural regularization across convolutional kernels in a CNN. In our approach, each convolution kernel is first decomposed as 2D dictionary atoms linearly combined by coefficients. The widely observed correlation and redundancy in a CNN hint a common low-rank structure among the decomposed coefficients, which is here further supported by our empirical observations. We then explicitly regularize CNN kernels by enforcing decomposed coefficients to be shared across sub-structures, while leaving each sub-structure only its own dictionary atoms, a few hundreds of parameters typically, which leads to dramatic model reductions. 
We explore models with sharing across different sub-structures to cover a wide range of trade-offs between parameter reduction and expressiveness. Our proposed regularized network structures open the door to better interpreting, training and adapting deep models. We validate the flexibility and compatibility of our method by image classification experiments on multiple datasets and underlying network structures, and show that CNNs now maintain performance with dramatic reduction in parameters and computations, e.g., only 5\\% parameters are used in a ResNet-18 to achieve comparable performance. Further experiments on few-shot classification show that faster and more robust task adaptation is obtained in comparison with models with standard convolutions.", + "title": "ACDC: Weight Sharing in Atom-Coefficient Decomposed Convolution", + "authors": [ + "Ze Wang", + "Xiuyuan Cheng", + "Guillermo Sapiro", + "Qiang Qiu" + ], + "emails": [ + "~Ze_Wang3", + "~Qiang_Qiu1", + "~Guillermo_Sapiro1", + "~Xiuyuan_Cheng1" + ], + "rank": 1733 + }, + { + "url": "https://openreview.net/forum?id=NyQedovJwAS", + "ratings": [ + 4, + 5, + 4, + 7 + ], + "rating": "5.07", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "Anomaly detection methods require high-quality features. One way of obtaining strong features is to adapt pre-trained features to anomaly detection on the target distribution. Unfortunately, simple adaptation methods often result in catastrophic collapse (feature deterioration) and reduce performance. DeepSVDD combats collapse by removing biases from architectures, but this limits the adaptation performance gain. In this work, we propose two methods for combating collapse: i) a variant of early stopping that dynamically learns the stopping iteration ii) elastic regularization inspired by continual learning. In addition, we conduct a thorough investigation of Imagenet-pretrained features for one-class anomaly detection. 
Our method, PANDA, outperforms the state-of-the-art in the one-class and outlier exposure settings (CIFAR10: 96.2% vs. 90.1% and 98.9% vs. 95.6%) .", + "title": "PANDA - Adapting Pretrained Features for Anomaly Detection", + "authors": [ + "Tal Reiss", + "Niv Cohen", + "Liron Bergman", + "Yedid Hoshen" + ], + "emails": [ + "~Yedid_Hoshen3", + "~Niv_Cohen1", + "~Liron_Bergman1", + "mail.huji.ac.il" + ], + "rank": 1734 + }, + { + "url": "https://openreview.net/forum?id=5WcLI0e3cAY", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5, + 6 + ], + "rating": "5.07", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Existing pre-trained language models (PLMs) have demonstrated the effectiveness of self-supervised learning for a broad range of natural language processing (NLP) tasks. However, most of them are not explicitly aware of domain-specific knowledge, which is essential for downstream tasks in many domains, such as tasks in e-commerce scenarios. In this paper, we propose K-PLUG, a knowledge-injected pre-trained language model based on the encoder-decoder transformer that can be transferred to both natural language understanding and generation tasks. We verify our method in a diverse range of e-commerce scenarios that require domain-specific knowledge. Specifically, we propose five knowledge-aware self-supervised pre-training objectives to formulate the learning of domain-specific knowledge, including e-commerce domain-specific knowledge-bases, aspects of product entities, categories of product entities, and unique selling propositions of product entities. K-PLUG achieves new state-of-the-art results on a suite of domain-specific NLP tasks, including product knowledge base completion, abstractive product summarization, and multi-turn dialogue, significantly outperforms baselines across the board, which demonstrates that the proposed method effectively learns a diverse set of domain-specific knowledge for both language understanding and generation tasks. 
The code, data, and models will be publicly available.\n", + "title": "K-PLUG: KNOWLEDGE-INJECTED PRE-TRAINED LANGUAGE MODEL FOR NATURAL LANGUAGE UNDERSTANDING AND GENERATION", + "authors": [ + "Song Xu", + "Haoran Li", + "Peng Yuan", + "Yujia Wang", + "Youzheng Wu", + "Xiaodong He", + "Ying Liu", + "Bowen Zhou" + ], + "emails": [ + "jd.com", + "~Xiaodong_He1", + "~Haoran_Li1", + "ruc.edu.cn", + "~Bowen_Zhou1", + "~Youzheng_Wu2" + ], + "rank": 1735 + }, + { + "url": "https://openreview.net/forum?id=GJnpCsLQThe", + "decision": "Reject", + "ratings": [ + 7, + 4, + 4, + 5 + ], + "rating": "5.07", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "In the paper, we study a class of useful non-convex minimax optimization problems on Riemanian manifolds and propose a class of Riemanian gradient descent ascent algorithms to solve these minimax problems. Specifically, we propose a new\nRiemannian gradient descent ascent (RGDA) algorithm for the deterministic minimax optimization.\nMoreover, we prove that the RGDA has a sample complexity of O(\u03ba2\u03f5\u22122) for finding an \u03f5-stationary point of the nonconvex strongly-concave minimax problems, where \u03ba denotes the condition number.\nAt the same time, we introduce a Riemannian stochastic gradient descent ascent (RSGDA) algorithm for the stochastic minimax optimization. In the theoretical analysis, we prove that the RSGDA can achieve a sample complexity of O(\u03ba4\u03f5\u22124).\nTo further reduce the sample complexity, we propose a novel momentum variance-reduced Riemannian stochastic gradient descent ascent (MVR-RSGDA) algorithm based on a new momentum variance-reduced technique of STORM. We prove that the MVR-RSGDA algorithm achieves a lower sample complexity of O~(\u03ba4\u03f5\u22123) without large batches, which reaches near the best known sample complexity for its Euclidean counterparts. 
Extensive experimental results on the robust deep neural networks training over Stiefel manifold demonstrate the efficiency of our proposed algorithms.", + "title": "Gradient Descent Ascent for Min-Max Problems on Riemannian Manifolds", + "authors": [ + "Feihu Huang", + "Shangqian Gao", + "Heng Huang" + ], + "emails": [ + "~Shangqian_Gao1", + "~Feihu_Huang1", + "~Heng_Huang1" + ], + "rank": 1736 + }, + { + "url": "https://openreview.net/forum?id=sjGBjudWib", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4, + 6 + ], + "rating": "5.07", + "confidences": [ + 4, + 3, + 5, + 3 + ], + "abstract": "The attention mechanism has demonstrated superior performance for inference over nodes in graph neural networks (GNNs), however, they result in a high computational burden during both training and inference. We propose FastGAT, a method to make attention based GNNs lightweight by using spectral sparsification to generate an optimal pruning of the input graph. This results in a per-epoch time that is almost linear in the number of graph nodes as opposed to quadratic. Further, we provide a re-formulation of a specific attention based GNN, Graph Attention Network (GAT) that interprets it as a graph convolution method using the random walk normalized graph Laplacian. Using this framework, we theoretically prove that spectral sparsification preserves the features computed by the GAT model, thereby justifying our FastGAT algorithm. 
We experimentally evaluate FastGAT on several large real world graph datasets for node classification tasks, FastGAT can dramatically reduce (up to 10x) the computational time and memory requirements, allowing the usage of attention based GNNs on large graphs.", + "title": "FAST GRAPH ATTENTION NETWORKS USING EFFECTIVE RESISTANCE BASED GRAPH SPARSIFICATION", + "authors": [ + "Rakshith Sharma Srinivasa", + "Cao Xiao", + "Lucas Glass", + "Justin Romberg", + "Jimeng Sun" + ], + "emails": [ + "~Cao_Xiao2", + "~Justin_Romberg1", + "gmail.com", + "iqvia.com", + "~Rakshith_Sharma_Srinivasa1" + ], + "rank": 1737 + }, + { + "url": "https://openreview.net/forum?id=JywMsiz_NtO", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 6 + ], + "rating": "5.07", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Many biochemical applications such as molecular property prediction require models to generalize beyond their training domains (environments). Moreover, natural environments in these tasks are structured, defined by complex descriptors such as molecular scaffolds or protein families. Therefore, most environments are either never seen during training, or contain only a single training example. To address these challenges, we propose a new regret minimization (RGM) algorithm and its extension for structured environments. RGM builds from invariant risk minimization (IRM) by recasting simultaneous optimality condition in terms of predictive regret, finding a representation that enables the predictor to compete against an oracle with hindsight access to held-out environments. The structured extension adaptively highlights variation due to complex environments via specialized domain perturbations. 
We evaluate our method on multiple applications: molecular property prediction, protein homology and stability prediction and show that RGM significantly outperforms previous state-of-the-art baselines.", + "title": "Enforcing Predictive Invariance across Structured Biomedical Domains", + "authors": [ + "Wengong Jin", + "Regina Barzilay", + "Tommi S. Jaakkola" + ], + "emails": [ + "~Regina_Barzilay1", + "~Wengong_Jin1", + "~Tommi_S._Jaakkola1" + ], + "rank": 1738 + }, + { + "url": "https://openreview.net/forum?id=mYNfmvt8oSv", + "decision": "Reject", + "ratings": [ + 5, + 8, + 4, + 4 + ], + "rating": "5.07", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "While improvements in deep learning architectures have played a crucial role in improving the state of supervised and unsupervised learning in computer vision and natural language processing, neural network architecture choices for reinforcement learning remain relatively under-explored. We take inspiration from successful architectural choices in computer vision and generative modeling, and investigate the use of deeper networks and dense connections for reinforcement learning on a variety of simulated robotic learning benchmark environments. Our findings reveal that current methods benefit significantly from dense connections and deeper networks, across a suite of manipulation and locomotion tasks, for both proprioceptive and image-based observations. We hope that our results can serve as a strong baseline and further motivate future research into neural network architectures for reinforcement learning. 
The project website is at this link https://sites.google.com/view/d2rl-anonymous/home", + "title": "D2RL: Deep Dense Architectures in Reinforcement Learning", + "authors": [ + "Samarth Sinha", + "Homanga Bharadhwaj", + "Aravind Srinivas", + "Animesh Garg" + ], + "emails": [ + "~Aravind_Srinivas1", + "~Animesh_Garg1", + "~Homanga_Bharadhwaj1", + "~Samarth_Sinha1" + ], + "rank": 1739 + }, + { + "url": "https://openreview.net/forum?id=TMUR2ovJfjE", + "decision": "Reject", + "ratings": [ + 4, + 7, + 5, + 4 + ], + "rating": "5.07", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "It is well known that the complexity of a classifier's function space controls its generalization gap, with two important examples being VC-dimension and Rademacher complexity (R-Complexity). We note that these traditional generalization error bounds consider the ground truth label generating function (LGF) to be fixed. However, if we consider a scenario where the LGF has no constraints at all, then the true generalization error can be large, irrespective of training performance, as the values of the LGF on unseen data points can be largely independent of the values on the training data. To account for this, in this work, we consider an extended characterization of the problem, where the ground truth labels are generated by a function within another function space, which we call the \\textit{generator} space. We find that the generalization gap in this scenario depends on the R-Complexity of both the classifier and the generator function spaces. Thus, we find that, even if the R-Complexity of the classifier is low and it has a good training fit, a highly complex generator space could worsen generalization performance, in accordance with the no free lunch theorem. Furthermore, the characterization of a generator space allows us to model constraints, such as invariances (translation and scale in vision) or local smoothness. 
Subsequently, we propose a joint entropy-like measure of complexity between function spaces (classifier and generator), called co-complexity, which leads to tighter bounds on the generalization error in this setting. Co-complexity captures the similarities between the classifier and generator spaces. It can be decomposed into an invariance co-complexity term, which measures the extent to which the classifier respects the invariant transformations in the generator, and a dissociation co-complexity term, which measures the ability of the classifier to differentiate separate categories in the generator. Our major finding is that reducing the invariance co-complexity of a classifier, while maintaining its dissociation co-complexity, improves the training error and reduces the generalization gap. Furthermore, our results, when specialized to the previous setting where the LGF is fixed, lead to tighter generalization error bounds. Theoretical results are supported by empirical validation on the CNN architecture and its transformation-equivariant extensions. Co-complexity showcases a new side to the generalization abilities of classifiers and can potentially be used to improve their design. ", + "title": "Co-complexity: An Extended Perspective on Generalization Error", + "authors": [ + "Rohan Ghosh", + "Mehul Motani" + ], + "emails": [ + "~Mehul_Motani1", + "~Rohan_Ghosh1" + ], + "rank": 1740 + }, + { + "url": "https://openreview.net/forum?id=g0a-XYjpQ7r", + "decision": "Reject", + "ratings": [ + 3, + 7, + 5, + 6 + ], + "rating": "5.07", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "Investigation of the degree of personalization in federated learning algorithms has shown that only maximizing the performance of the global model will confine the capacity of the local models to personalize. 
In this paper, we advocate an adaptive personalized federated learning (APFL) algorithm, where each client will train their local models while contributing to the global model. We derive the generalization bound of mixture of local and global models, and find the optimal mixing parameter. We also propose a communication-efficient optimization method to collaboratively learn the personalized models and analyze its convergence in both smooth strongly convex and nonconvex settings. The extensive experiments demonstrate the effectiveness of our personalization schema, as well as the correctness of established generalization theories.\n", + "title": "Adaptive Personalized Federated Learning", + "authors": [ + "yuyang deng", + "Mohammad Mahdi Kamani", + "Mehrdad Mahdavi" + ], + "emails": [ + "~yuyang_deng1", + "~Mehrdad_Mahdavi2", + "~Mohammad_Mahdi_Kamani2" + ], + "rank": 1741 + }, + { + "url": "https://openreview.net/forum?id=INhwJdJtxn6", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5, + 8 + ], + "rating": "5.07", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Designing agents that acquire knowledge autonomously and use it to solve new tasks efficiently is an important challenge in reinforcement learning. Unsupervised learning provides a useful paradigm for autonomous acquisition of task-agnostic knowledge. In supervised settings, representations discovered through unsupervised pre-training offer important benefits when transferred to downstream tasks. Given the nature of the reinforcement learning problem, we explore how to transfer knowledge through behavior instead of representations. The behavior of pre-trained policies may be used for solving the task at hand (exploitation), as well as for collecting useful data to solve the problem (exploration). We argue that pre-training policies to maximize coverage will result in behavior that is useful for both strategies. 
When using these policies for both exploitation and exploration, our agents discover solutions that lead to larger returns. The largest gains are generally observed in domains requiring structured exploration, including settings where the behavior of the pre-trained policies is misaligned with the downstream task.", + "title": "Coverage as a Principle for Discovering Transferable Behavior in Reinforcement Learning", + "authors": [ + "V\u00edctor Campos", + "Pablo Sprechmann", + "Steven Stenberg Hansen", + "Andre Barreto", + "Charles Blundell", + "Alex Vitvitskyi", + "Steven Kapturowski", + "Adria Puigdomenech Badia" + ], + "emails": [ + "~V\u00edctor_Campos1", + "~Steven_Kapturowski1", + "~Steven_Stenberg_Hansen1", + "~Charles_Blundell1", + "~Pablo_Sprechmann1", + "google.com", + "~Adria_Puigdomenech_Badia2", + "~Andre_Barreto1" + ], + "rank": 1742 + }, + { + "url": "https://openreview.net/forum?id=jYkO_0z2TAr", + "decision": "Reject", + "ratings": [ + 4, + 4, + 7 + ], + "rating": "5.07", + "confidences": [ + 5, + 4, + 5 + ], + "abstract": "Zero-shot learning relies on semantic class representations such as hand-engineered attributes or learned embeddings to predict classes without any labeled examples. We propose to learn class representations from common sense knowledge graphs. Common sense knowledge graphs are an untapped source of explicit high-level knowledge that requires little human effort to apply to a range of tasks. To capture the knowledge in the graph, we introduce ZSL-KG, a general-purpose framework with a novel transformer graph convolutional network (TrGCN) to generate class representations. Our proposed TrGCN architecture computes non-linear combinations of the node neighbourhood and leads to significant improvements on zero-shot learning tasks. We report new state-of-the-art accuracies on six zero-shot benchmark datasets in object classification, intent classification, and fine-grained entity typing tasks. 
ZSL-KG outperforms the specialized state-of-the-art method for each task by an average 1.7 accuracy points and outperforms the general-purpose method with the best average accuracy by 5.3 points. Our ablation study on ZSL-KG with alternate graph neural networks shows that our transformer-based aggregator adds up to 2.8 accuracy points improvement on these tasks.", + "title": "Zero-Shot Learning with Common Sense Knowledge Graphs", + "authors": [ + "Nihal Nayak", + "Stephen Bach" + ], + "emails": [ + "~Nihal_Nayak1", + "~Stephen_Bach1" + ], + "rank": 1743 + }, + { + "url": "https://openreview.net/forum?id=ng0IIc1mbTu", + "decision": "Reject", + "ratings": [ + 6, + 5, + 3, + 7 + ], + "rating": "5.07", + "confidences": [ + 3, + 5, + 4, + 3 + ], + "abstract": "Element-wise activation functions play a critical role in deep neural networks via affecting the expressivity power and the learning dynamics. Learning-based activation functions have recently gained increasing attention and success. We propose a new perspective of learnable activation function through formulating them with element-wise attention mechanism. In each network layer, we devise an attention module which learns an element-wise, sign-based attention map for the pre-activation feature map. The attention map scales an element based on its sign. Adding the attention module with a rectified linear unit (ReLU) results in an amplification of positive elements and a suppression of negative ones, both with learned, data-adaptive parameters. We coin the resulting activation function Attention-based Rectified Linear Unit (AReLU). The attention module essentially learns an element-wise residue of the activated part of the input, as ReLU can be viewed as an identity transformation. This makes the network training more resis- tant to gradient vanishing. The learned attentive activation leads to well-focused activation of relevant regions of a feature map. 
Through extensive evaluations, we show that AReLU significantly boosts the performance of most mainstream network architectures with only two extra learnable parameters per layer introduced. Notably, AReLU facilitates fast network training under small learning rates, which makes it especially suited in the case of transfer learning and meta learning.", + "title": "ARELU: ATTENTION-BASED RECTIFIED LINEAR UNIT", + "authors": [ + "Chen Dengsheng", + "Jun Li", + "Kai Xu" + ], + "emails": [ + "~Jun_Li4", + "~Kai_Xu5", + "~Chen_Dengsheng3" + ], + "rank": 1744 + }, + { + "url": "https://openreview.net/forum?id=e3KNSdWFOfT", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 4 + ], + "rating": "5.07", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Many recent AI architectures are inspired by zero-sum games, however, the behavior of their dynamics is still not well understood. Inspired by this, we study standard gradient descent ascent (GDA) dynamics in a specific class of non-convex non-concave zero-sum games, that we call hidden zero-sum games. In this class, players control the inputs of smooth but possibly non-linear functions whose outputs are being applied as inputs to a convex-concave game. Unlike general min-max games, these games have a well-defined notion of solution; outcomes that implement the von-Neumann equilibrium of the ``hidden convex-concave game. We prove that if the hidden game is strictly convex-concave then vanilla GDA converges not merely to local Nash, but typically to the von-Neumann solution. If the game lacks strict convexity properties, GDA may fail to converge to any equilibrium, however, by applying standard regularization techniques we can prove convergence to a von-Neumann solution of a slightly perturbed min-max game. Our convergence guarantees are non-local, which as far as we know is a first-of-its-kind type of result in non-convex non-concave games. 
Finally, we discuss connections of our framework with generative adversarial networks. \n", + "title": "Solving Min-Max Optimization with Hidden Structure via Gradient Descent Ascent", + "authors": [ + "Emmanouil-Vasileios Vlatakis-Gkaragkounis", + "Lampros Flokas", + "Georgios Piliouras" + ], + "emails": [ + "~Lampros_Flokas1", + "~Emmanouil-Vasileios_Vlatakis-Gkaragkounis1", + "~Georgios_Piliouras1" + ], + "rank": 1745 + }, + { + "url": "https://openreview.net/forum?id=M4qXqdw3xC", + "decision": "Reject", + "ratings": [ + 3, + 8, + 7, + 3 + ], + "rating": "5.07", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Recent studies have shown that the addition of zero padding drives convolutional neural networks (CNNs) to encode a significant amount of absolute position information in their internal representations, while a lack of padding precludes position encoding. Additionally, various studies have used image patches on background canvases (e.g., to accommodate that inputs to CNNs must be rectangular) without consideration that different backgrounds may contain varying levels of position information according to their color. These studies give rise to deeper questions about the role of boundary information in CNNs, that are explored in this paper: (i) What boundary heuristics (e.g., padding type, canvas color) enable optimal encoding of absolute position information for a particular downstream task?; (ii) Where in the latent representations do boundary effects destroy semantic and location information?; (iii) Does encoding position information affect the learning of semantic representations?; (iv) Does encoding position information always improve performance? To provide answers to these questions, we perform the largest case study to date on the role that padding and border heuristics play in CNNs. We first show that zero padding injects optimal position information into CNNs relative to other common padding types. 
We then design a series of novel tasks which allow us to accurately quantify boundary effects as a function of the distance to the border. A number of semantic objectives reveal the destructive effect of dealing with the border on semantic representations. Further, we demonstrate that the encoding of position information improves separability of learned semantic features. Finally, we demonstrate the implications of these findings on a number of real-world tasks to show that position information can act as a feature or a bug.", + "title": "Boundary Effects in CNNs: Feature or Bug?", + "authors": [ + "Md Amirul Islam", + "Matthew Kowal", + "Sen Jia", + "Konstantinos G. Derpanis", + "Neil Bruce" + ], + "emails": [ + "~Matthew_Kowal1", + "~Neil_Bruce1", + "~Konstantinos_G._Derpanis1", + "~Md_Amirul_Islam1", + "~Sen_Jia1" + ], + "rank": 1746 + }, + { + "url": "https://openreview.net/forum?id=foNTMJHXHXC", + "decision": "Reject", + "ratings": [ + 4, + 6, + 5, + 6 + ], + "rating": "5.07", + "confidences": [ + 5, + 4, + 4, + 2 + ], + "abstract": "Distributional shift is one of the major obstacles when transferring machine learning prediction systems from the lab to the real world. To tackle this problem, we assume that variation across training domains is representative of the variation we might encounter at test time, but also that shifts at test time may be more extreme in magnitude. In particular, we show that reducing differences in risk across training domains can reduce a model\u2019s sensitivity to a wide range of extreme distributional shifts, including the challenging setting where the input contains both causal and anti-causal elements. We motivate this approach, Risk Extrapolation (REx), as a form of robust optimization over a perturbation set of extrapolated domains (MM-REx), and propose a penalty on the variance of training risks (V-REx) as a simpler variant. 
We prove that variants of REx can recover the causal mechanisms of the targets, while also providing some robustness to changes in the input distribution (``covariate shift''). By appropriately trading-off robustness to causally induced distributional shifts and covariate shift, REx is able to outperform alternative methods such as Invariant Risk Minimization in situations where these types of shift co-occur.", + "title": "Out-of-Distribution Generalization via Risk Extrapolation (REx)", + "authors": [ + "David Krueger", + "Ethan Caballero", + "Joern-Henrik Jacobsen", + "Amy Zhang", + "Jonathan Binas", + "R\u00e9mi LE PRIOL", + "Dinghuai Zhang", + "Aaron Courville" + ], + "emails": [ + "~Jonathan_Binas1", + "~Ethan_Caballero1", + "~Aaron_Courville3", + "~Amy_Zhang1", + "~R\u00e9mi_LE_PRIOL1", + "~Joern-Henrik_Jacobsen1", + "~David_Krueger1", + "~Dinghuai_Zhang1" + ], + "rank": 1747 + }, + { + "url": "https://openreview.net/forum?id=NgZKCRKaY3J", + "decision": "Reject", + "ratings": [ + 6, + 7, + 4, + 4 + ], + "rating": "5.07", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "Building reliable machine learning systems requires that we correctly understand their level of confidence. Calibration focuses on measuring the degree of accuracy in a model's confidence and most research in calibration focuses on techniques to improve an empirical estimate of calibration error, ECEBIN. Using simulation, we show that ECEBIN can systematically underestimate or overestimate the true calibration error depending on the nature of model miscalibration, the size of the evaluation data set, and the number of bins. Critically, ECEBIN is more strongly biased for perfectly calibrated models. We propose a simple alternative calibration error metric, ECESWEEP, in which the number of bins is chosen to be as large as possible while preserving monotonicity in the calibration function. 
Evaluating our measure on distributions fit to neural network confidence scores on CIFAR-10, CIFAR-100, and ImageNet, we show that ECESWEEP produces a less biased estimator of calibration error and therefore should be used by any researcher wishing to evaluate the calibration of models trained on similar datasets.", + "title": "Mitigating bias in calibration error estimation", + "authors": [ + "Rebecca Roelofs", + "Nicholas Cain", + "Jonathon Shlens", + "Michael Curtis Mozer" + ], + "emails": [ + "~Jonathon_Shlens1", + "~Michael_Curtis_Mozer1", + "~Rebecca_Roelofs1", + "~Nicholas_Cain1" + ], + "rank": 1748 + }, + { + "url": "https://openreview.net/forum?id=ypJS_nyu-I", + "decision": "Reject", + "ratings": [ + 6, + 4, + 4, + 6 + ], + "rating": "5.07", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "We investigate the discounting mismatch in actor-critic algorithm implementations from a representation learning perspective. Theoretically, actor-critic algorithms usually have discounting for both actor and critic, i.e., there is a \u03b3t term in the actor update for the transition observed at time t in a trajectory and the critic is a discounted value function. Practitioners, however, usually ignore the discounting (\u03b3t) for the actor while using a discounted critic. We investigate this mismatch in two scenarios. In the first scenario, we consider optimizing an undiscounted objective (\u03b3=1) where \u03b3t disappears naturally (1t=1). We then propose to interpret the discounting in critic in terms of a bias-variance-representation trade-off and provide supporting empirical results. 
In the second scenario, we consider optimizing a discounted objective (\u03b3<1) and propose to interpret the omission of the discounting in the actor update from an auxiliary task perspective and provide supporting empirical results.", + "title": "A Deeper Look at Discounting Mismatch in Actor-Critic Algorithms", + "authors": [ + "Shangtong Zhang", + "Romain Laroche", + "Harm van Seijen", + "Shimon Whiteson", + "Remi Tachet des Combes" + ], + "emails": [ + "~Shimon_Whiteson1", + "~Romain_Laroche1", + "~Shangtong_Zhang1", + "~Harm_van_Seijen1", + "~Remi_Tachet_des_Combes1" + ], + "rank": 1749 + }, + { + "url": "https://openreview.net/forum?id=P5RQfyAmrU", + "decision": "Reject", + "ratings": [ + 5, + 4, + 6, + 5 + ], + "rating": "5.07", + "confidences": [ + 3, + 4, + 5, + 3 + ], + "abstract": "We discover that deep ReLU neural network classifiers can see a low-dimensional Riemannian manifold structure on data. Such structure comes via the local data matrix, a variation of the Fisher information matrix, where the role of the model parameters is taken by the data variables. We obtain a foliation of the data domain and we show that the dataset on which the model is trained lies on a leaf, the data leaf, whose dimension is bounded by the number of classification labels. We validate our results with some experiments with the MNIST dataset: paths on the data leaf connect valid images, while other leaves cover noisy images.\n", + "title": " Model-centric data manifold: the data through the eyes of the model", + "authors": [ + "Luca Grementieri", + "Rita Fioresi" + ], + "emails": [ + "nextbit.it", + "~Rita_Fioresi1" + ], + "rank": 1750 + }, + { + "url": "https://openreview.net/forum?id=KcLlh3Qe7KU", + "decision": "Reject", + "ratings": [ + 4, + 7, + 5, + 4 + ], + "rating": "5.07", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Most computer vision datasets are composed of disconnected sets, such as images of different objects. 
We prove that distributions of this type of data cannot be represented with a continuous generative network without error, independent of the learning algorithm used. Disconnected datasets can be represented in two ways: with an ensemble of networks or with a single network using a truncated latent space. We show that ensembles are more desirable than truncated distributions for several theoretical and computational reasons. We construct a regularized optimization problem that rigorously establishes the relationships between a single continuous GAN, an ensemble of GANs, conditional GANs, and Gaussian Mixture GANs. The regularization can be computed efficiently, and we show empirically that our framework has a performance sweet spot that can be found via hyperparameter tuning. The ensemble framework provides better performance than a single continuous GAN or cGAN while maintaining fewer total parameters. ", + "title": "Ensembles of Generative Adversarial Networks for Disconnected Data", + "authors": [ + "Lorenzo Luzi", + "Randall Balestriero", + "Richard Baraniuk" + ], + "emails": [ + "~Richard_Baraniuk1", + "~Randall_Balestriero1", + "~Lorenzo_Luzi1" + ], + "rank": 1751 + }, + { + "url": "https://openreview.net/forum?id=IKqCy8i1XL3", + "ratings": [ + 5, + 5, + 4, + 6 + ], + "rating": "5.06", + "confidences": [ + 3, + 4, + 4, + 5 + ], + "abstract": "The information bottleneck (IB) principle is an elegant and useful learning framework for extracting relevant information that an input feature contains about the target. The principle has been widely used in supervised and unsupervised learning. In this paper, we investigate the effectiveness of the IB framework in reinforcement learning (RL). We first derive the objective based on IB in reinforcement learning, then we analytically derive the optimal conditional distribution of the optimization problem. 
Following the variational information bottleneck (VIB), we provide a variational lower bound using a prior distribution. Unlike VIB, we propose to utilize the amortized Stein variational gradient method to optimize the lower bound. We incorporate this framework in two popular RL algorithms: the advantageous actor critic algorithm (A2C) and the proximal policy optimization algorithm (PPO). Our experimental results show that our framework can improve the sample efficiency of vanilla A2C and PPO. We also show that our method achieves better performance than VIB and mutual information neural estimation (MINE), two other popular approaches to optimize the information bottleneck framework in supervised learning.", + "title": "Optimizing Information Bottleneck in Reinforcement Learning: A Stein Variational Approach", + "authors": [ + "Pei Yingjun", + "Hou Xinwen", + "Li Jian", + "Lei Wang" + ], + "emails": [ + "~Li_Jian1", + "nlpr.ia.ac.cn", + "~Lei_Wang22", + "~Pei_Yingjun1" + ], + "rank": 1752 + }, + { + "url": "https://openreview.net/forum?id=faE-D_0d4M", + "ratings": [ + 8, + 4, + 5, + 4 + ], + "rating": "5.06", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "Existing approaches to few-shot learning deal with tasks that have persistent, rigid notions of classes. Typically, the learner observes data only from a fixed number of classes at training time and is asked to generalize to a new set of classes at test time. Two examples from the same class would always be assigned the same labels in any episode. In this work, we consider a realistic setting where the relationship between examples can change from episode to episode depending on the task context, which is not given to the learner. We define two new benchmark datasets for this flexible few-shot scenario, where the tasks are based on images of faces (Celeb-A) and shoes (Zappos50K). 
While classification baselines learn representations that work well for standard few-shot learning, they suffer in our flexible tasks since the classification criteria shift from training to testing. On the other hand, unsupervised contrastive representation learning with instance-based invariance objectives preserves such flexibility. A combination of instance and class invariance learning objectives is found to perform best on our new flexible few-shot learning benchmarks, and a novel variant of Prototypical Networks is proposed for selecting useful feature dimensions.", + "title": "Exploring representation learning for flexible few-shot tasks", + "authors": [ + "Mengye Ren", + "Eleni Triantafillou", + "Kuan-Chieh Wang", + "James Lucas", + "Jake Snell", + "Xaq Pitkow", + "Andreas S. Tolias", + "Richard Zemel" + ], + "emails": [ + "~Mengye_Ren1", + "~Jake_Snell1", + "~James_Lucas1", + "~Andreas_S._Tolias1", + "~Xaq_Pitkow1", + "~Kuan-Chieh_Wang1", + "~Eleni_Triantafillou1", + "~Richard_Zemel1" + ], + "rank": 1753 + }, + { + "url": "https://openreview.net/forum?id=FU5IpSznDKd", + "ratings": [ + 5, + 5, + 4, + 6 + ], + "rating": "5.06", + "confidences": [ + 4, + 5, + 3, + 4 + ], + "abstract": "This paper is concerned with ranking many pre-trained deep neural networks (DNNs), called checkpoints, for the transfer learning to a downstream task. Thanks to the broad use of DNNs, we may easily collect hundreds of checkpoints from various sources. Which of them transfers the best to our downstream task of interest? Striving to answer this question thoroughly, we establish a neural checkpoint ranking benchmark (\\benchmark) and study some intuitive ranking measures. These measures are generic, applying to the checkpoints of different output types without knowing how the checkpoints are pre-trained on which dataset. They also incur low computation costs, making them practically meaningful. 
Our results suggest that the linear separability of the features extracted by the checkpoints is a strong indicator of transferability. We also arrive at a new ranking measure, NLEEP, which gives rise to the best performance in the experiments.", + "title": "Ranking Neural Checkpoints", + "authors": [ + "YANDONG LI", + "Xuhui Jia", + "Ruoxin Sang", + "Yukun Zhu", + "Bradley Green", + "Liqiang Wang", + "Boqing Gong" + ], + "emails": [ + "~Bradley_Green3", + "~Liqiang_Wang1", + "~Yukun_Zhu1", + "~YANDONG_LI1", + "~Xuhui_Jia1", + "google.com", + "~Boqing_Gong1" + ], + "rank": 1754 + }, + { + "url": "https://openreview.net/forum?id=jDIWFyftpQh", + "decision": "Reject", + "ratings": [ + 6, + 5, + 4, + 5 + ], + "rating": "5.06", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "While deep learning methods have shown great success in medical image analysis, they require a number of medical images to train. Due to data privacy concerns and unavailability of medical annotators, it is oftentimes very difficult to obtain a lot of labeled medical images for model training. In this paper, we study cross-modality data augmentation to mitigate the data deficiency issue in medical imaging domain. We propose a discriminative unpaired image-to-image translation model which translate images in source modality into images in target modality where the translation task is conducted jointly with the downstream prediction task and the translation is guided by the prediction. Experiments on two applications demonstrate the effectiveness of our method. 
", + "title": "Discriminative Cross-Modal Data Augmentation for Medical Imaging Applications", + "authors": [ + "Yue Yang", + "Pengtao Xie" + ], + "emails": [ + "~Yue_Yang2", + "~Pengtao_Xie3" + ], + "rank": 1755 + }, + { + "url": "https://openreview.net/forum?id=kXwdjtmMbUr", + "decision": "Reject", + "ratings": [ + 4, + 3, + 8, + 4 + ], + "rating": "5.06", + "confidences": [ + 4, + 3, + 5, + 4 + ], + "abstract": "We reconsider the evaluation of OOD detection methods for image recognition. Although many studies have been conducted so far to build better OOD detection methods, most of them follow Hendrycks and Gimpel's work for the method of experimental evaluation. While the unified evaluation method is necessary for a fair comparison, there is a question of if its choice of tasks and datasets reflect real-world applications and if the evaluation results can generalize to other OOD detection application scenarios. In this paper, we experimentally evaluate the performance of representative OOD detection methods for three scenarios, i.e., irrelevant input detection, novel class detection, and domain shift detection, on various datasets and classification tasks. The results show that differences in scenarios and datasets alter the relative performance among the methods. Our results can also be used as a guide for practitioners for the selection of OOD detection methods.", + "title": "Practical Evaluation of Out-of-Distribution Detection Methods for Image Classification", + "authors": [ + "Engkarat Techapanurak", + "Takayuki Okatani" + ], + "emails": [ + "~Takayuki_Okatani1", + "~Engkarat_Techapanurak1" + ], + "rank": 1756 + }, + { + "url": "https://openreview.net/forum?id=78SlGFxtlM", + "decision": "Reject", + "ratings": [ + 6, + 5, + 4, + 5 + ], + "rating": "5.06", + "confidences": [ + 4, + 4, + 3, + 5 + ], + "abstract": "Recent years have seen a surge of interest in meta-learning techniques for tackling the few-shot learning (FSL) problem. 
However, the meta-learner's initial model is prone to meta-overfit, as there are only a few available samples with sampling noise. Besides, when handling the data sampled with label noise for FSL, meta-learner could be extremely sensitive to label noise. To address these two challenges that FSL with sampling and label noise. In particular, we first cast the meta-overfitting problem (overfitting on sampling and label noise) as a gradient noise problem since few available samples cause meta-learner to overfit on existing examples (clean or corrupted) of an individual task at every gradient step. We present Eigen-Reptile (ER) that updates the meta-parameters with the main direction of historical task-specific parameters to alleviate gradient noise. Specifically, the main direction is computed by a special mechanism for the parameter's large size. Furthermore, to obtain a more accurate main direction for Eigen-Reptile in the presence of label noise, we propose Introspective Self-paced Learning (ISPL) that constructs a plurality of prior models to determine which sample should be abandoned. We have proved the effectiveness of Eigen-Reptile and ISPL, respectively, theoretically and experimentally. 
Moreover, our experiments on different tasks demonstrate that the proposed methods outperform or achieve highly competitive performance compared with the state-of-the-art methods with or without noisy labels.", + "title": "Robust Meta-learning with Noise via Eigen-Reptile", + "authors": [ + "Dong Chen", + "Lingfei Wu", + "Siliang Tang", + "Fangli Xu", + "Juncheng Li", + "Chang Zong", + "Chilie Tan", + "Yueting Zhuang" + ], + "emails": [ + "~Yueting_Zhuang1", + "~Lingfei_Wu1", + "yixue.us", + "zju.edu.cn", + "~Dong_Chen5", + "~Siliang_Tang1", + "tongdun.net" + ], + "rank": 1757 + }, + { + "url": "https://openreview.net/forum?id=AZ4vmLoJft", + "decision": "Reject", + "ratings": [ + 5, + 7, + 5, + 4 + ], + "rating": "5.06", + "confidences": [ + 4, + 3, + 5, + 5 + ], + "abstract": "Semantic code similarity systems are integral to a range of applications from code recommendation to automated software defect correction. Yet, these systems still lack the maturity in accuracy for general and reliable wide-scale usage. To help address this, we present Machine Inferred Code Similarity (MISIM), a novel end-to-end code similarity system that consists of two core components. First, MISIM uses a novel context-aware semantic structure (CASS), which is designed to aid in lifting semantic meaning from code syntax. We compare CASS with the abstract syntax tree (AST) and show CASS is more accurate than AST by up to 1.67x. Second, MISIM provides a neural-based code similarity scoring algorithm, which can be implemented with various neural network architectures with learned parameters. We compare MISIM to four state-of-the-art systems: (i) Aroma, (ii) code2seq, (iii) code2vec, and (iv) Neural Code Comprehension. 
In our experimental evaluation across 328,155 programs (over 18 million lines of code), MISIM has 1.5x to 43.4x better accuracy across all four systems.\n", + "title": "(Updated submission 11/20/2020) MISIM: A Novel Code Similarity System", + "authors": [ + "Fangke Ye", + "Shengtian Zhou", + "Anand Venkat", + "Ryan Marcus", + "Nesime Tatbul", + "Jesmin Jahan Tithi", + "Niranjan Hasabnis", + "Paul Petersen", + "Timothy G Mattson", + "Tim Kraska", + "Pradeep Dubey", + "Vivek Sarkar", + "Justin Gottschlich" + ], + "emails": [ + "gatech.edu", + "~Tim_Kraska1", + "~Ryan_Marcus1", + "~Shengtian_Zhou1", + "~Pradeep_Dubey1", + "~Nesime_Tatbul1", + "~Vivek_Sarkar2", + "intel.com", + "~Justin_Gottschlich1" + ], + "rank": 1758 + }, + { + "url": "https://openreview.net/forum?id=KcTBbZ1kM6K", + "decision": "Reject", + "ratings": [ + 7, + 4, + 4, + 5 + ], + "rating": "5.06", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "The mismatch between training dataset and target environment is one major challenge for current machine learning systems. When training data is collected from multiple environments and the the evaluation is on any new environment, we are facing an Out-of-Distribution (OOD) generalization problem that aims to find a model with the best OOD accuracy, i.e. the best worst-environment accuracy. However, with limited access to environments, the worst environment may be unseen, and test accuracy is a biased estimate of OOD accuracy. In this paper, we show that test accuracy may dramatically fail to identify OOD accuracy and mislead the tuning procedure. To this end, we introduce Influence Function, a classical tool from robust statistics, into the OOD generalization problem and suggest the variance of influence function to measure the stability of a model on training environments. 
We show that the proposed index and test accuracy together can help us discern whether OOD algorithms are needed and whether a model achieves good OOD generalization.", + "title": "Out-of-Distribution Generalization Analysis via Influence Function", + "authors": [ + "Haotian Ye", + "Chuanlong Xie", + "Yue Liu", + "Zhenguo Li" + ], + "emails": [ + "~Chuanlong_Xie1", + "huawei.com", + "~Zhenguo_Li1", + "pku.edu.cn" + ], + "rank": 1759 + }, + { + "url": "https://openreview.net/forum?id=tqc8n6oHCtZ", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5, + 5 + ], + "rating": "5.06", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Although transformers have achieved impressive accuracies in various tasks in natural language processing, they often come with a prohibitive computational cost, that prevents their use in scenarios with limited computational resources for inference. This need for computational efficiency in inference has been addressed by for instance PoWER-BERT (Goyal et al., 2020) which gradually decreases the length of a sequence as it is passed through layers. These approaches however often assume that the target computational complexity is known in advance at the time of training. This implies that a separate model must be trained for each inference scenario with its distinct computational budget. In this paper, we extend PoWER-BERT to address this issue of inefficiency and redundancy. The proposed extension enables us to train a large-scale transformer, called Length-Adaptive Transformer, once and uses it for various inference scenarios without re-training it. To do so, we train a transformer with LengthDrop, a structural variant of dropout, which stochastically determines the length of a sequence at each layer. We then use a multi-objective evolutionary search to find a length configuration that maximizes the accuracy and minimizes the computational complexity under any given computational budget. 
Additionally, we significantly extend the applicability of PoWER-BERT beyond sequence-level classification into token-level classification such as span-based question-answering, by introducing the idea of Drop-and-Restore. With Drop-and-Restore, word-vectors are dropped temporarily in intermediate layers and restored at the last layer if necessary. We empirically verify the utility of the proposed approach by demonstrating the superior accuracy-efficiency trade-off under various setups, including SQuAD 1.1, MNLI-m, and SST-2. Upon publication, the code to reproduce our work will be open-sourced.\n", + "title": "Length-Adaptive Transformer: Train Once with Length Drop, Use Anytime with Search", + "authors": [ + "Gyuwan Kim", + "Kyunghyun Cho" + ], + "emails": [ + "~Gyuwan_Kim1", + "~Kyunghyun_Cho1" + ], + "rank": 1760 + }, + { + "url": "https://openreview.net/forum?id=lbc44k2jgnX", + "decision": "Reject", + "ratings": [ + 4, + 4, + 6, + 6 + ], + "rating": "5.06", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "Langevin Monte Carlo (LMC) is a popular Markov chain Monte Carlo sampling method. One drawback is that it requires the computation of the full gradient at each iteration, an expensive operation if the dimension of the problem is high. We propose a new sampling method: Random Coordinate LMC (RC-LMC). At each iteration, a single coordinate is randomly selected to be updated by a multiple of the partial derivative along this direction plus noise, and all other coordinates remain untouched. We investigate the total complexity of RC-LMC and compare it with the classical LMC for log-concave probability distributions. 
When the gradient of the log-density is Lipschitz, RC-LMC is less expensive than the classical LMC if the log-density is highly skewed for high dimensional problems, and when both the gradient and the Hessian of the log-density are Lipschitz, RC-LMC is always cheaper than the classical LMC, by a factor proportional to the square root of the problem dimension. In the latter case, our estimate of complexity is sharp with respect to the dimension. ", + "title": "Random Coordinate Langevin Monte Carlo", + "authors": [ + "Zhiyan Ding", + "Qin Li", + "Jianfeng Lu", + "Stephen Wright" + ], + "emails": [ + "~Jianfeng_Lu1", + "~Stephen_Wright1", + "~Qin_Li1", + "~Zhiyan_Ding1" + ], + "rank": 1761 + }, + { + "url": "https://openreview.net/forum?id=uhiF-dV99ir", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 6 + ], + "rating": "5.06", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "Training artificial neural networks requires the optimization of highly non-convex loss functions. Throughout the years, the scientific community has developed an extensive set of tools and architectures that render this optimization task tractable and a general intuition has been developed for choosing hyper parameters that help the models reach minima that generalize well to unseen data. However, for the most part, the difference in trainability in between architectures, tasks and even the gap in network generalization abilities still remain unexplained. Visualization tools have played a key role in uncovering key geometric characteristics of the loss-landscape of ANNs and how they impact trainability and generalization capabilities. However, most visualizations methods proposed so far have been relatively limited in their capabilities since they are of linear nature and only capture features in a limited number of dimensions. 
We propose the use of the modern dimensionality reduction method PHATE which represents the SOTA in terms of capturing both global and local structures of high-dimensional data. We apply this method to visualize the loss landscape during and after training. Our visualizations reveal differences in training trajectories and generalization capabilities when used to make comparisons between optimization methods, initializations, architectures, and datasets. Given this success we anticipate this method to be used in making informed choices about these aspects of neural networks.", + "title": "Visualizing High-Dimensional Trajectories on the Loss-Landscape of ANNs", + "authors": [ + "Stefan Horoi", + "Jessie Huang", + "Guy Wolf", + "Smita Krishnaswamy" + ], + "emails": [ + "~Stefan_Horoi1", + "~Smita_Krishnaswamy1", + "yale.edu", + "~Guy_Wolf1" + ], + "rank": 1762 + }, + { + "url": "https://openreview.net/forum?id=4SZ9Ft--pDl", + "decision": "Reject", + "ratings": [ + 3, + 8, + 4, + 4, + 6 + ], + "rating": "5.05", + "confidences": [ + 4, + 4, + 4, + 4, + 5 + ], + "abstract": "While Bayesian Optimization (BO) is a very popular method for optimizing expensive black-box functions, it fails to leverage the experience of domain experts. This causes BO to waste function evaluations on bad design choices (e.g., machine learning hyperparameters) that the expert already knows to work poorly. To address this issue, we introduce Prior-guided Bayesian Optimization (PrBO). PrBO allows users to inject their knowledge into the optimization process in the form of priors about which parts of the input space will yield the best performance, rather than BO\u2019s standard priors over functions (which are much less intuitive for users). PrBO then combines these priors with BO\u2019s standard probabilistic model to form a pseudo-posterior used to select which points to evaluate next. 
We show that PrBO is around 12x faster than state-of-the-art methods without user priors and 10,000x faster than random search on a common suite of benchmarks, and achieves a new state-of-the-art performance on a real-world hardware design application. We also show that PrBO converges faster even if the user priors are not entirely accurate and that it robustly recovers from misleading priors.", + "title": "Prior-guided Bayesian Optimization", + "authors": [ + "Artur Souza", + "Luigi Nardi", + "Leonardo Oliveira", + "Kunle Olukotun", + "Marius Lindauer", + "Frank Hutter" + ], + "emails": [ + "~Kunle_Olukotun1", + "~Frank_Hutter1", + "~Artur_Souza1", + "~Luigi_Nardi1", + "~Marius_Lindauer1", + "~Leonardo_Oliveira1" + ], + "rank": 1763 + }, + { + "url": "https://openreview.net/forum?id=6lH8nkwKRXV", + "decision": "Reject", + "ratings": [ + 7, + 3, + 4, + 6 + ], + "rating": "5.05", + "confidences": [ + 5, + 5, + 4, + 5 + ], + "abstract": "Graph neural networks have proven to be very efficient to solve several tasks in graphs such as node classification or link prediction. These algorithms that operate by propagating information from vertices to their neighbors allow one to build node embeddings that contain local information. In order to use graph neural networks for graph classification, node embeddings must be aggregated to obtain a graph representation able to discriminate among different graphs (of possibly various sizes). Moreover, in analogy to neural networks for image classification, there is a need for explainability regarding the features that are selected in the graph classification process. To this end, we introduce StructAgg, a simple yet effective aggregation process based on the identification of structural roles for nodes in graphs that we use to create an end-to-end model. Through extensive experiments we show that this architecture can compete with state-of-the-art methods. 
We show how this aggregation step allows us to cluster together nodes that have comparable structural roles and how these roles provide explainability to this neural network model.\n", + "title": "Graph Structural Aggregation for Explainable Learning", + "authors": [ + "Alexis Galland", + "marc lelarge" + ], + "emails": [ + "~Alexis_Galland1", + "~marc_lelarge1" + ], + "rank": 1764 + }, + { + "url": "https://openreview.net/forum?id=txC1ObHJ0wB", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 5, + 3 + ], + "abstract": "Weight sharing promises to make neural architecture search (NAS) tractable even on commodity hardware. Existing methods in this space rely on a diverse set of heuristics to design and train the shared-weight backbone network, a.k.a. the super-net. Since heuristics substantially vary across different methods and have not been carefully studied, it is unclear to which extent they impact super-net training and hence the weight-sharing NAS algorithms. In this paper, we disentangle super-net training from the search algorithm, isolate 14 frequently-used training heuristics, and evaluate them over three benchmark search spaces. Our analysis uncovers that several commonly-used heuristics negatively impact the correlation between super-net and stand-alone performance, whereas simple, but often overlooked factors, such as proper hyper-parameter settings, are key to achieve strong performance. 
Equipped with this knowledge, we show that simple random search achieves competitive performance to complex state-of-the-art NAS algorithms when the super-net is properly trained.\n", + "title": "How to Train Your Super-Net: An Analysis of Training Heuristics in Weight-Sharing NAS", + "authors": [ + "Kaicheng Yu", + "Rene Ranftl", + "Mathieu Salzmann" + ], + "emails": [ + "~Kaicheng_Yu1", + "~Mathieu_Salzmann1", + "~Rene_Ranftl1" + ], + "rank": 1765 + }, + { + "url": "https://openreview.net/forum?id=uSYfytRBh-f", + "decision": "Reject", + "ratings": [ + 4, + 3, + 8 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Image segmentation lays the foundation for many high-stakes vision applications such as autonomous driving and medical image analysis. It is, therefore, of great importance to not only improve the accuracy of segmentation models on well-established benchmarks, but also enhance their robustness in the real world so as to avoid sparse but fatal failures. In this paper, instead of chasing state-of-the-art performance on existing benchmarks, we turn our attention to a new challenging problem: how to efficiently expose failures of ``top-performing'' segmentation models in the real world and how to leverage such counterexamples to rectify the models. To achieve this with minimal human labelling effort, we first automatically sample a small set of images that are likely to falsify the target model from a large corpus of web images via the maximum discrepancy competition principle. We then propose a weakly labelling strategy to further reduce the number of false positives, before time-consuming pixel-level labelling by humans. Finally, we fine-tune the model to harness the identified failures, and repeat the whole process, resulting in an efficient and progressive framework for troubleshooting segmentation models. 
We demonstrate the feasibility of our framework using the semantic segmentation task in PASCAL VOC, and find that the fine-tuned model exhibits significantly improved generalization when applied to real-world images with greater content diversity. All experimental codes will be publicly released upon acceptance.", + "title": "Efficiently Troubleshooting Image Segmentation Models with Human-In-The-Loop", + "authors": [ + "Haotao Wang", + "Tianlong Chen", + "Zhangyang Wang", + "Kede Ma" + ], + "emails": [ + "~Zhangyang_Wang1", + "~Kede_Ma2", + "~Haotao_Wang1", + "~Tianlong_Chen1" + ], + "rank": 1766 + }, + { + "url": "https://openreview.net/forum?id=zKg145rDe8D", + "ratings": [ + 6, + 6, + 3, + 6 + ], + "rating": "5.00", + "confidences": [ + 2, + 3, + 4, + 3 + ], + "abstract": "Gradient-based attribution methods can aid in the understanding of convolutional neural networks (CNNs). However, the redundancy of attribution features and the gradient saturation problem, which weaken the ability to identify signi\ufb01cant features and cause the explanation focus shift, remain open challenges. In this work, we propose 1) an essential characteristic, Relevant, when selecting attribution features; 2) a new concept, feature map importance (FMI), to re\ufb01ne the contribution of each feature map, which is faithful to the CNN model; and 3) a novel attribution method via FMI, termed A-FMI, to address the gradient saturation problem, which couples the target image with a reference image, and assigns the FMI to the \u201cdifference-from-reference\u201d at the granularity of feature map. 
Through visual inspections and qualitative evaluations of common models on the ImageNet dataset, we show the compelling advantages of A-FMI on faithfulness, insensitivity to the choice of reference, class discriminability, and the superior explanation performance compared with popular attribution methods.", + "title": "A-FMI: Learning Attributions from Deep Networks via Feature Map Importance", + "authors": [ + "An Zhang", + "Xiang Wang", + "Chengfang Fang", + "Jie Shi", + "Xiangnan He", + "Tat-seng Chua", + "Zehua Chen" + ], + "emails": [ + "~Tat-seng_Chua1", + "~Xiangnan_He1", + "~Xiang_Wang6", + "~An_Zhang2", + "huawei.com", + "nus.edu.sg" + ], + "rank": 1767 + }, + { + "url": "https://openreview.net/forum?id=Oe2XI-Aft-k", + "decision": "Reject", + "ratings": [ + 4, + 6, + 6, + 4 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Despite the recent advances in adversarial training based defenses, deep neural networks are still vulnerable to adversarial attacks outside the perturbation type they are trained to be robust against. Recent works have proposed defenses to improve the robustness of a single model against the union of multiple perturbation types. However, when evaluating the model against each individual attack, these methods still suffer significant trade-offs compared to the ones specifically trained to be robust against that perturbation type. In this work, we introduce the problem of categorizing adversarial examples based on their \u2113p perturbation types. Based on our analysis, we propose PROTECTOR, a two-stage pipeline to improve the robustness against multiple perturbation types. Instead of training a single predictor, PROTECTOR first categorizes the perturbation type of the input, and then utilizes a predictor specifically trained against the predicted perturbation type to make the final prediction. 
We first theoretically show that adversarial examples created by different perturbation types constitute different distributions, which makes it possible to distinguish them. Further, we show that at test time the adversary faces a natural trade-off between fooling the perturbation type classifier and the succeeding predictor optimized with perturbation specific adversarial training. This makes it challenging for an adversary to plant strong attacks against the whole pipeline. In addition, we demonstrate the realization of this trade-off in deep networks by adding random noise to the model input at test time, enabling enhanced robustness against strong adaptive attacks. Extensive experiments on MNIST and CIFAR-10 show that PROTECTOR outperforms prior adversarial training based defenses by over 5%, when tested against the union of \u21131,\u21132,\u2113\u221e attacks.", + "title": "Perturbation Type Categorization for Multiple \u2113p Bounded Adversarial Robustness", + "authors": [ + "Pratyush Maini", + "Xinyun Chen", + "Bo Li", + "Dawn Song" + ], + "emails": [ + "~Dawn_Song1", + "~Pratyush_Maini1", + "~Bo_Li19", + "~Xinyun_Chen1" + ], + "rank": 1768 + }, + { + "url": "https://openreview.net/forum?id=XkI_ggnfLZ4", + "decision": "Reject", + "ratings": [ + 5, + 4, + 7, + 4 + ], + "rating": "5.00", + "confidences": [ + 3, + 5, + 5, + 5 + ], + "abstract": "A common paradigm in model pruning is to train a model, prune, and then either fine-tune or, in the lottery ticket framework, reinitialize and retrain. Prior work has implicitly assumed that the best training configuration for model evaluation is also the best configuration for mask discovery. However, what if a training configuration which yields worse performance actually yields a mask which trains to higher performance? To test this, we decoupled the hyperparameters for mask discovery (H_find) and mask evaluation (H_eval). 
Using unstructured magnitude pruning on vision classification tasks, we discovered the \"decoupled find-eval phenomenon,\" in which certain H_find values lead to models which have lower performance, but generate masks with substantially higher eventual performance compared to using the same hyperparameters for both stages. We show that this phenomenon holds across a number of models, datasets, configurations, and also for one-shot structured pruning. Finally, we demonstrate that different H_find values yield masks with materially different layerwise pruning ratios and that the decoupled find-eval phenomenon is causally mediated by these ratios. Our results demonstrate the practical utility of decoupling hyperparameters and provide clear insights into the mechanisms underlying this counterintuitive effect. ", + "title": "Uncovering the impact of hyperparameters for global magnitude pruning", + "authors": [ + "Janice Lan", + "Rudy Chin", + "Alexei Baevski", + "Ari S. Morcos" + ], + "emails": [ + "~Rudy_Chin2", + "~Ari_S._Morcos1", + "~Janice_Lan1", + "~Alexei_Baevski1" + ], + "rank": 1769 + }, + { + "url": "https://openreview.net/forum?id=fm58XfadSTF", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": " Sentiment analysis is a costly yet necessary task for enterprises to study the opinions of their costumers to improve their products and services and to determine optimal marketing strategies. Due to existence of a wide range of domains across different products and services, cross-domain sentiment analysis methods have received significant attention in recent years. These methods mitigate the domain gap between different applications by training cross-domain generalizable classifiers which help to relax the need for individual data annotation per each domain. 
Most existing methods focus on learning domain-agnostic representations that are invariant with respect to both the source and the target domains. As a result, a classifier that is trained using annotated data in a source domain, would generalize well in a related target domain. In this work, we introduce a new domain adaptation method which induces large margins between different classes in an embedding space based on the notion of prototypical distribution. This embedding space is trained to be domain-agnostic by matching the data distributions across the domains. Large margins in the source domain help to reduce the effect of ``domain shift'' on the performance of a trained classifier in the target domain. Theoreticaland empirical analysis are provided to demonstrate that the method is effective. ", + "title": "Learning a Max-Margin Classifier for Cross-Domain Sentiment Analysis", + "authors": [ + "Mohammad Rostami", + "Aram Galstyan" + ], + "emails": [ + "~Mohammad_Rostami1", + "~Aram_Galstyan1" + ], + "rank": 1770 + }, + { + "url": "https://openreview.net/forum?id=QSMvGB5j5-", + "decision": "Reject", + "ratings": [ + 4, + 6, + 6 + ], + "rating": "5.00", + "confidences": [ + 4, + 2, + 2 + ], + "abstract": "Dynamic graphs are rife with higher-order interactions, such as co-authorship relationships and protein-protein interactions in biological networks, that naturally arise between more than two nodes at once. In spite of the ubiquitous presence of such higher-order interactions, limited attention has been paid to the higher-order counterpart of the popular pairwise link prediction problem. Existing higher-order structure prediction methods are mostly based on heuristic feature extraction procedures, which work well in practice but lack theoretical guarantees. Such heuristics are primarily focused on predicting links in a static snapshot of the graph. 
Moreover, these heuristic-based methods fail to effectively utilize and benefit from the knowledge of latent substructures already present within the higher-order structures. In this paper, we overcome these obstacles by capturing higher-order interactions succinctly as simplices, model their neighborhood by face-vectors, and develop a nonparametric kernel estimator for simplices that views the evolving graph from the perspective of a time process (i.e., a sequence of graph snapshots). Our method substantially outperforms several baseline higher-order prediction methods. As a theoretical achievement, we prove the consistency and asymptotic normality in terms of Wasserstein distance of our estimator using Stein's method.", + "title": "Higher-order Structure Prediction in Evolving Graph Simplicial Complexes", + "authors": [ + "Manohar Kaul", + "Masaaki Imaizumi" + ], + "emails": [ + "~Masaaki_Imaizumi1", + "~Manohar_Kaul1" + ], + "rank": 1771 + }, + { + "url": "https://openreview.net/forum?id=HdX654Yn81", + "decision": "Reject", + "ratings": [ + 7, + 5, + 3 + ], + "rating": "5.00", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "Variational Autoencoder (VAE) based frameworks have achieved the state-of-the-art performance on the unsupervised disentangled representation learning. A recent theoretical analysis shows that such success is mainly due to the VAE implementation choices that encourage a PCA-like behavior locally on data samples. Despite this implied model identifiability, the VAE based disentanglement frameworks still face the trade-off between the local orthogonality and data reconstruction. As a result, models with the same architecture and hyperparameter setting can sometime learn entangled representations. To address this challenge, we propose a simple yet effective VAE ensemble framework consisting of multiple VAEs. 
It is based on the assumption that entangled representations are unique in their own ways, and the disentangled representations are \"alike\" (similar up to a signed permutation transformation). In the proposed VAE ensemble, each model not only maintains its original objective, but also encodes to and decodes from other models through pair-wise linear transformations between the latent representations. We show both theoretically and experimentally, the VAE ensemble objective encourages the linear transformations connecting the VAEs to be trivial transformations, aligning the latent representations of different models to be \"alike\". We compare our approach with the state-of-the-art unsupervised disentangled representation learning approaches and show the improved performance.", + "title": "Improving the Unsupervised Disentangled Representation Learning with VAE Ensemble", + "authors": [ + "Nanxiang Li", + "Shabnam Ghaffarzadegan", + "Liu Ren" + ], + "emails": [ + "~Shabnam_Ghaffarzadegan1", + "~Liu_Ren1", + "~Nanxiang_Li1" + ], + "rank": 1772 + }, + { + "url": "https://openreview.net/forum?id=9CG8RW_p3Y", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 2, + 3, + 1 + ], + "abstract": "Many machine learning applications involve learning representations that achieve two competing goals: To maximize information or accuracy with respect to a target while simultaneously maximizing invariance or independence with respect to a subset of features. Typical examples include privacy-preserving learning, domain adaptation, and algorithmic fairness, just to name a few. In fact, all of the above problems admit a common minimax game-theoretic formulation, whose equilibrium represents a fundamental tradeoff between accuracy and invariance. In this paper, we provide an information-theoretic analysis of this general and important problem under both classification and regression settings. 
In both cases, we analyze the inherent tradeoffs between accuracy and invariance by providing a geometric characterization of the feasible region in the information plane, where we connect the geometric properties of this feasible region to the fundamental limitations of the tradeoff problem. In the regression setting, we also derive a tight lower bound on the Lagrangian objective that quantifies the tradeoff between accuracy and invariance. Our results shed new light on this fundamental problem by providing insights on the interplay between accuracy and invariance. These results deepen our understanding of this fundamental problem and may be useful in guiding the design of adversarial representation learning algorithms.\n", + "title": "Fundamental Limits and Tradeoffs in Invariant Representation Learning", + "authors": [ + "Han Zhao", + "Chen Dan", + "Bryon Aragam", + "Tommi S. Jaakkola", + "Geoff Gordon", + "Pradeep Kumar Ravikumar" + ], + "emails": [ + "~Han_Zhao1", + "~Chen_Dan1", + "~Bryon_Aragam1", + "~Pradeep_Kumar_Ravikumar1", + "~Geoff_Gordon2", + "~Tommi_S._Jaakkola1" + ], + "rank": 1773 + }, + { + "url": "https://openreview.net/forum?id=hx0D7wn6qIy", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 4 + ], + "rating": "5.00", + "confidences": [ + 4, + 2, + 3, + 3 + ], + "abstract": "We propose a novel semi-supervised learning (SSL) method that adopts selective training with pseudo labels. In our method, we generate hard pseudo-labels and also estimate their confidence, which represents how likely each pseudo-label is to be correct. Then, we explicitly select which pseudo-labeled data should be used to update the model. Specifically, assuming that loss on incorrectly pseudo-labeled data sensitively increase against data augmentation, we select the data corresponding to relatively small loss after applying data augmentation. 
The confidence is used not only for screening candidates of pseudo-labeled data to be selected but also for automatically deciding how many pseudo-labeled data should be selected within a mini-batch. Since accurate estimation of the confidence is crucial in our method, we also propose a new data augmentation method, called MixConf, that enables us to obtain confidence-calibrated models even when the number of training data is small. Experimental results with several benchmark datasets validate the advantage of our SSL method as well as MixConf.", + "title": "Semi-supervised learning by selective training with pseudo labels via confidence estimation", + "authors": [ + "Masato Ishii" + ], + "emails": [ + "~Masato_Ishii1" + ], + "rank": 1774 + }, + { + "url": "https://openreview.net/forum?id=I3xhgVtNC5t", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 5, + 4 + ], + "rating": "5.00", + "confidences": [ + 3, + 4, + 3, + 3, + 3 + ], + "abstract": "Wasserstein distributionally robust optimization (DRO) has recently received significant attention in machine learning due to its connection to generalization, robustness and regularization. Existing methods only consider a limited class of loss functions or apply to small values of robustness. In this paper, we present a three-player game framework for solving Wasserstein DRO problem with arbitrary level of robustness, which can handle general loss functions. Specifically, we formulate a min-max game between three players who optimize over probability measures, model parameters and Lagrange multipliers. We also propose new algorithms for finding an equilibrium of the game in convex and non-convex settings which both enjoy provable convergence guarantees. Furthermore, we prove an excess risk bound for the proposed algorithms which shows that the solution returned by the algorithms closely achieves the optimal minimax risk. 
", + "title": "Wasserstein Distributionally Robust Optimization: A Three-Player Game Framework", + "authors": [ + "Zhuozhuo Tu", + "Shan You", + "Tao Huang", + "Dacheng Tao" + ], + "emails": [ + "~Shan_You3", + "~Tao_Huang5", + "~Dacheng_Tao1", + "~Zhuozhuo_Tu1" + ], + "rank": 1775 + }, + { + "url": "https://openreview.net/forum?id=Ggx8fbKZ1-D", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 3, + 3, + 4, + 2 + ], + "abstract": "Adaptive learning rates can lead to faster convergence and better final performance\nfor deep learning models. There are several widely known human-designed adap-\ntive optimizers such as Adam and RMSProp, gradient based adaptive methods\nsuch as hyper-descent and L4, and meta learning approaches including learning\nto learn. However, the issue of balancing adaptiveness and over-parameterization\nis still a topic to be addressed. In this study, we investigate different levels of\nlearning rate adaptation based on the framework of hyper-gradient descent, and\nfurther propose a method that adaptively learns the model parameters for combin-\ning different levels of adaptations. Meanwhile, we show the relationship between\nadding regularization on over-parameterized learning rates and building combi-\nnations of different levels of adaptive learning rates. 
The experiments on several\nnetwork architectures including feed-forward networks, LeNet-5 and ResNet-18/34\nshow that the proposed multi-level adaptive approach can outperform baseline\nadaptive methods in a variety circumstances with statistical significance.", + "title": "Adaptive Hierarchical Hyper-gradient Descent", + "authors": [ + "RENLONG JIE", + "Junbin Gao", + "Andrey Vasnev", + "Minh-Ngoc Tran" + ], + "emails": [ + "~Minh-Ngoc_Tran1", + "~RENLONG_JIE1", + "sydney.edu.au", + "~Junbin_Gao1" + ], + "rank": 1776 + }, + { + "url": "https://openreview.net/forum?id=1OCwJdJSnSA", + "decision": "Reject", + "ratings": [ + 4, + 6, + 5 + ], + "rating": "5.00", + "confidences": [ + 3, + 3, + 3 + ], + "abstract": "The domain adaptation problem involves learning a unique classification or regression model capable of performing on both a source and a target domain. Although the labels for the source data are available during training, the labels in the target domain are unknown. An effective way to tackle this problem lies in extracting insightful features invariant to the source and target domains. In this work, we propose splitting the information for each domain into a task-related representation and its complimentary context representation. We propose an original method to disentangle these two representations in the single-domain supervised case. We then adapt this method to the unsupervised domain adaptation problem. In particular, our method allows disentanglement in the target domain, despite the absence of training labels. This enables the isolation of task-specific information from both domains and a projection into a common representation. The task-specific representation allows efficient transfer of knowledge acquired from the source domain to the target domain. 
We validate the proposed method on several classical domain adaptation benchmarks and illustrate the benefits of disentanglement for domain adaptation.", + "title": "Disentangled cyclic reconstruction for domain adaptation", + "authors": [ + "David Bertoin", + "Emmanuel Rachelson" + ], + "emails": [ + "~David_Bertoin1", + "~Emmanuel_Rachelson1" + ], + "rank": 1777 + }, + { + "url": "https://openreview.net/forum?id=IUYthV32lbK", + "decision": "Reject", + "ratings": [ + 6, + 5, + 4, + 5 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4, + 2 + ], + "abstract": "Recent studies show that deep neural networks (DNN) are vulnerable to adversarial examples, which aim to mislead DNNs to make arbitrarily incorrect predictions. To defend against such attacks, both empirical and theoretical defense approaches have been proposed for a single ML model. In this work, we aim to explore and characterize the robustness conditions for ensemble ML models. We prove that the diversified gradient and large confidence margin are sufficient and necessary conditions for certifiably robust ensemble models under the model-smoothness assumption. We also show that an ensemble model can achieve higher certified robustness than a single base model based on these conditions. To our best knowledge, this is the first work providing tight conditions for the ensemble robustness. Inspired by our analysis, we propose the lightweight Diversity Regularized Training (DRT) for ensemble models. We derive the certified robustness of DRT based ensembles such as standard Weighted Ensemble and Max-Margin Ensemble following the sufficient and necessary conditions. Besides, to efficiently calculate the model-smoothness, we leverage adapted randomized model smoothing to obtain the certified robustness for different ensembles in practice. We show that the certified robustness of ensembles, on the other hand, verifies the necessity of DRT. 
To compare different ensembles, we prove that when the adversarial transferability among base models is high, Max-Margin Ensemble can achieve higher certified robustness than Weighted Ensemble; vice versa. Extensive experiments show that ensemble models trained with DRT can achieve the state-of-the-art certified robustness under various settings. Our work will shed light on future analysis for robust ensemble models. ", + "title": "On the Certified Robustness for Ensemble Models and Beyond", + "authors": [ + "Zhuolin Yang", + "Linyi Li", + "Xiaojun Xu", + "Bhavya Kailkhura", + "Bo Li" + ], + "emails": [ + "~Linyi_Li1", + "~Xiaojun_Xu1", + "~Bo_Li19", + "~Zhuolin_Yang1", + "~Bhavya_Kailkhura1" + ], + "rank": 1778 + }, + { + "url": "https://openreview.net/forum?id=pHkBwAaZ3UK", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 5, + 4, + 2, + 4 + ], + "abstract": "Different nodes in a graph neighborhood generally yield different importance. In previous work of Graph Convolutional Networks (GCNs), such differences are typically modeled with attention mechanisms. However, as we prove in our paper, soft attention weights suffer from over-smoothness in large neighborhoods. To address this weakness, we introduce a novel framework of conducting graph convolutions, where nodes are discretely selected among multi-hop neighborhoods to construct adaptive receptive fields (ARFs). ARFs enable GCNs to get rid of the over-smoothness of soft attention weights, as well as to efficiently explore long-distance dependencies in graphs. We further propose GRARF (GCN with Reinforced Adaptive Receptive Fields) as an instance, where an optimal policy of constructing ARFs is learned with reinforcement learning. GRARF achieves or matches state-of-the-art performances on public datasets from different domains. 
Our further analysis corroborates that GRARF is more robust than attention models against neighborhood noises.", + "title": "Learning Discrete Adaptive Receptive Fields for Graph Convolutional Networks", + "authors": [ + "Xiaojun Ma", + "Ziyao Li", + "Lingjun Xu", + "Guojie Song", + "Yi Li", + "Chuan Shi" + ], + "emails": [ + "~Guojie_Song1", + "~Ziyao_Li1", + "bupt.edu.cn", + "pku.edu.cn" + ], + "rank": 1779 + }, + { + "url": "https://openreview.net/forum?id=v16dIb3Ud9t", + "ratings": [ + 5, + 4, + 4, + 7 + ], + "rating": "5.00", + "confidences": [ + 4, + 2, + 4, + 3 + ], + "abstract": "Efficient exploration is a long-standing problem in reinforcement learning. In this work, we introduce a self-supervised exploration policy by incentivizing the agent to maximize multisensory incongruity, which can be measured in two aspects: perception incongruity and action incongruity. The former represents the uncertainty in multisensory fusion model, while the latter represents the uncertainty in an agent's policy. Specifically, an alignment predictor is trained to detect whether multiple sensory inputs are aligned, the error of which is used to measure perception incongruity. The policy takes the multisensory observations with sensory-wise dropout as input, and outputs actions for exploration. The variance of actions is further used to measure action incongruity. Our formulation allows the agent to learn skills by exploring in a self-supervised manner without any external rewards. Besides, our method enables the agent to learn a compact multimodal representation from hard examples, which further improves the sample efficiency of our policy learning. We demonstrate the efficacy of this formulation across a variety of benchmark environments including object manipulation and audio-visual games. 
", + "title": "SEMI: Self-supervised Exploration via Multisensory Incongruity", + "authors": [ + "Jianren Wang", + "Ziwen Zhuang", + "Hang Zhao" + ], + "emails": [ + "~Hang_Zhao1", + "~Jianren_Wang2", + "~Ziwen_Zhuang1" + ], + "rank": 1780 + }, + { + "url": "https://openreview.net/forum?id=mb2L9vL-MjI", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "A numerical and phenomenological study of the gradient descent (GD) algorithm for training two-layer neural network models is carried out for different parameter regimes. It is found that there are two distinctive phases in the GD dynamics in the under-parameterized regime: An early phase in which the GD dynamics follows closely that of the corresponding random feature model, followed by a late phase in which the neurons are divided into two groups: a group of a few (maybe none) \u201cactivated\u201d neurons that dominate the dynamics and a group of ``quenched\u201d neurons that support the continued activation and deactivation process. In particular, when the target function can be accurately approximated by a relatively small number of neurons, this quenching-activation process biases GD to picking sparse solutions. This neural network-like behavior is continued into the mildly over-parameterized regime, in which it undergoes a transition to a random feature-like behavior where the inner-layer parameters are effectively frozen during the training process. The quenching process seems to provide a clear mechanism for ``implicit regularization''. 
This is qualitatively different from the GD dynamics associated with the ``mean-field'' scaling where all neurons participate equally.", + "title": "The Quenching-Activation Behavior of the Gradient Descent Dynamics for Two-layer Neural Network Models", + "authors": [ + "Chao Ma", + "Lei Wu", + "Weinan E" + ], + "emails": [ + "~Weinan_E1", + "~Lei_Wu1", + "~Chao_Ma8" + ], + "rank": 1781 + }, + { + "url": "https://openreview.net/forum?id=ryUprTOv7q0", + "decision": "Reject", + "ratings": [ + 6, + 4, + 4, + 5, + 6 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4, + 4, + 4 + ], + "abstract": "We develop a new quantum neural network layer designed to run efficiently on a quantum computer but that can be simulated on a classical computer when restricted in the way it entangles input states. We first ask how a classical neural network architecture, both fully connected or convolutional, can be executed on a quantum computer using quantum phase estimation. We then deform the classical layer into a quantum design which entangles activations and weights into quantum superpositions. While the full model would need the exponential speedups delivered by a quantum computer, a restricted class of designs represent interesting new classical network layers that still use quantum features. We show that these quantum deformed neural networks can be trained and executed on normal data such as images, and even classically deliver modest improvements over standard architectures. 
", + "title": "Quantum Deformed Neural Networks", + "authors": [ + "Roberto Bondesan", + "Max Welling" + ], + "emails": [ + "~Roberto_Bondesan1", + "~Max_Welling1" + ], + "rank": 1782 + }, + { + "url": "https://openreview.net/forum?id=rEaz5uTcL6Q", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Transformer-based language models have proved capable of rudimentary symbolic reasoning, underlining the effectiveness of applying self-attention computations to sets of discrete entities. In this work, we apply this lesson to videos of physical interaction between objects. We show that self-attention-based models operating on discrete, learned, object-centric representations perform well on spatio-temporal reasoning tasks which were expressly designed to trouble traditional neural network models and to require higher-level cognitive processes such as causal reasoning and understanding of intuitive physics and narrative structure. We achieve state of the art results on two datasets, CLEVRER and CATER, significantly outperforming leading hybrid neuro-symbolic models. Moreover, we find that techniques from language modelling, such as BERT-style semi-supervised predictive losses, allow our model to surpass neuro-symbolic approaches while using 40% less labelled data. 
Our results corroborate the idea that neural networks can reason about the causal, dynamic structure of visual data and attain understanding of intuitive physics, which counters the popular claim that they are only effective at perceptual pattern-recognition and not reasoning per se.", + "title": "Neural spatio-temporal reasoning with object-centric self-supervised learning", + "authors": [ + "David Ding", + "Felix Hill", + "Adam Santoro", + "Matthew Botvinick" + ], + "emails": [ + "~Felix_Hill1", + "~Adam_Santoro1", + "~Matthew_Botvinick1", + "~David_Ding2" + ], + "rank": 1783 + }, + { + "url": "https://openreview.net/forum?id=fgpXAu8puGj", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 6 + ], + "rating": "5.00", + "confidences": [ + 2, + 3, + 4, + 4 + ], + "abstract": "Neural architectures and hardware accelerators have been two driving forces for the rapid progress in deep learning.\nAlthough previous works have optimized either neural architectures given fixed hardware, or hardware given fixed neural architectures, none has considered optimizing them jointly. In this paper, we study the importance of co-designing neural architectures and hardware accelerators. To this end, we propose NAHAS, an automated hardware design paradigm that jointly searches for the best configuration for both neural architecture and accelerator. In NAHAS, accelerator hardware design is conditioned on the dynamically explored neural networks for the targeted application, instead of fixed architectures, thus providing better performance opportunities. Our experiments with an industry-standard edge accelerator show that NAHAS consistently outperforms previous platform-aware neural architecture search and state-of-the-art EfficientNet on all latency targets by 0.5% - 1% ImageNet top-1 accuracy, while reducing latency by about 20%. 
Joint optimization reduces the search samples by 2x and reduces the latency constraint violations from 3 violations to 1 violation per 4 searches, compared to independently optimizing the two sub spaces.", + "title": "NAHAS: Neural Architecture and Hardware Accelerator Search", + "authors": [ + "Yanqi Zhou", + "Xuanyi Dong", + "Daiyi Peng", + "Ethan Zhu", + "Amir Yazdanbakhsh", + "Berkin Akin", + "Mingxing Tan", + "James Laudon" + ], + "emails": [ + "~Amir_Yazdanbakhsh1", + "~Daiyi_Peng1", + "~Mingxing_Tan3", + "~Yanqi_Zhou1", + "~Xuanyi_Dong1", + "google.com", + "~Berkin_Akin1", + "~James_Laudon1" + ], + "rank": 1784 + }, + { + "url": "https://openreview.net/forum?id=pAJ3svHLDV", + "decision": "Reject", + "ratings": [ + 3, + 6, + 6 + ], + "rating": "5.00", + "confidences": [ + 3, + 3, + 3 + ], + "abstract": "Decomposing a complex scene into multiple objects is a natural instinct of an intelligent vision system. Recently, the interest in unsupervised scene representation learning emerged and many previous works tackle this by decomposing scenes into object representations either in the form of segmentation masks or position and scale latent variables (i.e. bounding boxes). We observe that these two types of representation both contain object geometric information and should be consistent with each other. Inspired by this observation, we provide an unsupervised generative framework called R-MONet that can generate objects geometric representation in the form of bounding boxes and segmentation masks simultaneously. While bounding boxes can represent the region of interest (ROI) for generating foreground segmentation masks, the foreground segmentation masks can also be used to supervise bounding boxes learning with the Multi-Otsu Thresholding method. 
Through the experiments on CLEVR and Multi-dSprites datasets, we show that ensuring the consistency of two types of representation can help the model to decompose the scene and learn better object geometric representations.", + "title": "R-MONet: Region-Based Unsupervised Scene Decomposition and Representation via Consistency of Object Representations", + "authors": [ + "Shengxin Qian" + ], + "emails": [ + "~Shengxin_Qian1" + ], + "rank": 1785 + }, + { + "url": "https://openreview.net/forum?id=eoQBpdMy81m", + "decision": "Reject", + "ratings": [ + 7, + 4, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 2, + 4, + 4, + 4 + ], + "abstract": "Federated averaging (FedAvg), despite its simplicity, has been the main approach in training neural networks in the federated learning setting. In this work, we show that the algorithmic choices of the FedAvg algorithm correspond to optimizing a single objective function that involves the global and all of the shard specific models using a hard version of the well known Expectation-Maximization (EM) algorithm. As a result, we gain a better understanding of the behavior and design choices of federated averaging while being able to provide interesting connections to recent literature. Based on this view, we further propose FedSparse, a version of federated averaging that employs prior distributions to promote model sparsity. In this way, we obtain a procedure that leads to reductions in both server-client and client-server communication costs as well as more efficient models. 
", + "title": "Federated Averaging as Expectation Maximization", + "authors": [ + "Christos Louizos", + "Matthias Reisser", + "Joseph Soriaga", + "Max Welling" + ], + "emails": [ + "~Matthias_Reisser1", + "qti.qualcomm.com", + "~Max_Welling1", + "~Christos_Louizos1" + ], + "rank": 1786 + }, + { + "url": "https://openreview.net/forum?id=Twm9LnWK-zt", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 3, + 3, + 4, + 4, + 4 + ], + "abstract": "Conditional Generative Adversarial Networks (cGAN) were designed to generate images based on the provided conditions, e.g., class-level distributions. However, existing methods have used the same generating architecture for all classes. This paper presents a novel idea that adopts NAS to find a distinct architecture for each class. The search space contains regular and class-modulated convolutions, where the latter is designed to introduce class-specific information while avoiding the reduction of training data for each class generator. The search algorithm follows a weight-sharing pipeline with mixed-architecture optimization so that the search cost does not grow with the number of classes. To learn the sampling policy, a Markov decision process is embedded into the search algorithm and a moving average is applied for better stability. We evaluate our approach on CIFAR10 and CIFAR100. 
Besides achieving better image generation quality in terms of FID scores, we discover several insights that are helpful in designing cGAN models.", + "title": "Searching towards Class-Aware Generators for Conditional Generative Adversarial Networks", + "authors": [ + "Peng Zhou", + "Lingxi Xie", + "XIAOPENG ZHANG", + "Bingbing Ni", + "Qi Tian" + ], + "emails": [ + "~Peng_Zhou2", + "~Bingbing_Ni3", + "~XIAOPENG_ZHANG7", + "~Qi_Tian3", + "~Lingxi_Xie1" + ], + "rank": 1787 + }, + { + "url": "https://openreview.net/forum?id=0migj5lyUZl", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "As a recognized variant and improvement for Trust Region Policy Optimization (TRPO), proximal policy optimization (PPO) has been widely used with several advantages: efficient data utilization, easy implementation and good parallelism. In this paper, a first-order gradient on-policy learning algorithm called Policy Optimization with Penalized Point Probability Distance (POP3D), which is a lower bound to the square of total variance divergence is proposed as another powerful variant. The penalty item has dual effects, prohibiting policy updates from overshooting and encouraging more explorations. Carefully controlled experiments on both discrete and continuous benchmarks verify our approach is highly competitive to PPO.", + "title": "A Strong On-Policy Competitor To PPO", + "authors": [ + "Xiangxiang Chu" + ], + "emails": [ + "~Xiangxiang_Chu1" + ], + "rank": 1788 + }, + { + "url": "https://openreview.net/forum?id=UpZFBWxr1g3", + "ratings": [ + 6, + 3, + 7, + 4 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "As a widely used neural network model in NLP (neural language processing), transformer model achieves state-of-the-art performance in several translation tasks. Transformer model has a fixed skip connection architecture among different layers. 
However, the influence of other possible skip connection architectures are not discussed completely in transformer model. We search different architectures of skip connection to discover better architectures in different datasets. To improve the efficiency of trying different skip connection architectures, we apply the idea of network morphism to add skip connections as a procedure of fine-tuning. Our fine-tuning method outperforms the best models trained by the same or smaller datasets in WMT'16 En-De, WMT'14 En-Fr and WMT'18 En-De with 226M back-translation sentences. We also make experiment on transferring searched skip connection architectures to new transformer models.", + "title": "Improving Machine Translation by Searching Skip Connections Efficiently", + "authors": [ + "Chen Yang", + "Houfeng Wang" + ], + "emails": [ + "~Houfeng_Wang1", + "~Chen_Yang2" + ], + "rank": 1789 + }, + { + "url": "https://openreview.net/forum?id=WcdPUPkyffZ", + "ratings": [ + 6, + 4, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "Word alignment is essential for the down-streaming cross-lingual language understanding and generation tasks. Recently, the performance of the neural word alignment models has exceeded that of statistical models. However, they heavily rely on sophisticated translation models. In this study, we propose a super lightweight unsupervised word alignment model, dubbed MirrorAlign, in which bidirectional symmetric attention trained with a contrastive learning objective is introduced, and an agreement loss is employed to bind the attention maps, such that the alignments follow mirror-like symmetry hypothesis. Experimental results on several public benchmarks demonstrate that our model achieves competitive, if not better, performance compared to the state of the art in word alignment while significantly reducing the training and decoding time on average. 
Further ablation analysis and case studies show the superiority of our proposed MirrorAlign. Notably, we recognize our model as a pioneer attempt to unify bilingual word embedding and word alignments. Encouragingly, our approach achieves 16.4X speedup against GIZA++, and 50X parameter compression compared with the Transformer-based alignment methods. We released our code to facilitate the community: https://github.com/ICLR20anonymous/mirroralign", + "title": "Unsupervised Word Alignment via Cross-Lingual Contrastive Learning", + "authors": [ + "Di Wu", + "Liang Ding", + "Shuo Yang", + "Dacheng Tao" + ], + "emails": [ + "~Dacheng_Tao1", + "~Liang_Ding3", + "~Shuo_Yang7", + "~Di_Wu8" + ], + "rank": 1790 + }, + { + "url": "https://openreview.net/forum?id=Cn706AbJaKW", + "decision": "Reject", + "ratings": [ + 5, + 6, + 3, + 6 + ], + "rating": "5.00", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "Mainstream machine learning conferences have seen a dramatic increase in the number of participants, along with a growing range of perspectives, in recent years. Members of the machine learning community are likely to overhear allegations ranging from randomness of acceptance decisions to institutional bias. In this work, we critically analyze the review process through a comprehensive study of papers submitted to ICLR between 2017 and 2020. We quantify reproducibility/randomness in review scores and acceptance decisions, and examine whether scores correlate with paper impact. Our findings suggest strong institutional bias in accept/reject decisions, even after controlling for paper quality. Furthermore, we find evidence for a gender gap, with female authors receiving lower scores, lower acceptance rates, and fewer citations per paper than their male counterparts. We conclude our work with recommendations for future conference organizers. 
", + "title": "An Open Review of OpenReview: A Critical Analysis of the Machine Learning Conference Review Process", + "authors": [ + "David Tran", + "Alexander V Valtchanov", + "Keshav R Ganapathy", + "Raymond Feng", + "Eric Victor Slud", + "Micah Goldblum", + "Tom Goldstein" + ], + "emails": [ + "~Raymond_Feng1", + "umd.edu", + "~David_Tran1", + "~Alexander_V_Valtchanov2", + "~Micah_Goldblum1", + "~Tom_Goldstein1", + "~Keshav_R_Ganapathy1" + ], + "rank": 1791 + }, + { + "url": "https://openreview.net/forum?id=9WlOIHve8dU", + "decision": "Reject", + "ratings": [ + 6, + 3, + 7, + 4 + ], + "rating": "5.00", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "One of the most classical problems in machine learning is how to learn binary trees that split data into meaningful partitions. From classification/regression via decision trees to hierarchical clustering, binary trees are useful because they (a) are often easy to visualize; (b) make computationally-efficient predictions; and (c) allow for flexible partitioning. Because of this there has been extensive research on how to learn such trees. Optimization generally falls into one of three categories: 1. greedy node-by-node optimization; 2. probabilistic relaxations for differentiability; 3. mixed-integer programming (MIP). Each of these have downsides: greedy can myopically choose poor splits, probabilistic relaxations do not have principled ways to prune trees, MIP methods can be slow on large problems and may not generalize. In this work we derive a novel sparse relaxation for binary tree learning. By sparsely relaxing a new MIP, our approach is able to learn tree splits and tree pruning using state-of-the-art gradient-based approaches. 
We demonstrate how our approach is easily visualizable, is efficient, and is competitive with current work in classification/regression and hierarchical clustering.", + "title": "Learning Binary Trees via Sparse Relaxation", + "authors": [ + "Valentina Zantedeschi", + "Matt Kusner", + "Vlad Niculae" + ], + "emails": [ + "~Matt_Kusner1", + "~Vlad_Niculae2", + "~Valentina_Zantedeschi2" + ], + "rank": 1792 + }, + { + "url": "https://openreview.net/forum?id=S2UB9PkrEjF", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "This work considers two distinct settings: imitation learning and goal-conditioned reinforcement learning. In either case, effective solutions require the agent to reliably reach a specified state (a goal), or set of states (a demonstration). Drawing a connection between probabilistic long-term dynamics and the desired value function, this work introduces an approach that utilizes recent advances in density estimation to effectively learn to reach a given state. We develop a unified view on the two settings and show that the approach can be applied to both. In goal-conditioned reinforcement learning, we show it to circumvent the problem of sparse rewards while addressing hindsight bias in stochastic domains. 
In imitation learning, we show that the approach can learn from extremely sparse amounts of expert data and achieves state-of-the-art results on a common benchmark.", + "title": "Universal Value Density Estimation for Imitation Learning and Goal-Conditioned Reinforcement Learning", + "authors": [ + "Yannick Schroecker", + "Charles Lee Isbell" + ], + "emails": [ + "~Yannick_Schroecker1", + "~Charles_Lee_Isbell1" + ], + "rank": 1793 + }, + { + "url": "https://openreview.net/forum?id=iKXWZru0DS", + "decision": "Reject", + "ratings": [ + 4, + 4, + 7 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Despite the success of reinforcement learning methods, they have yet to have their breakthrough moment when applied to a broad range of robotic manipulation tasks. This is partly due to the fact that reinforcement learning algorithms are notoriously difficult and time consuming to train, which is exacerbated when training from images rather than full-state inputs. As humans perform manipulation tasks, our eyes closely monitor every step of the process with our gaze focusing sequentially on the objects being manipulated. With this in mind, we present our Attention-driven Robotic Manipulation (ARM) algorithm, which is a general manipulation algorithm that can be applied to a range of real-world sparse-rewarded tasks without any prior task knowledge. ARM splits the complex task of manipulation into a 3 stage pipeline: (1) a Q-attention agent extracts interesting pixel locations from RGB and point cloud inputs, (2) a next-best pose agent that accepts crops from the Q-attention agent and outputs poses, and (3) a control agent that takes the goal pose and outputs joint actions. 
We show that current state-of-the-art reinforcement learning algorithms catastrophically fail on a range of RLBench tasks, whilst ARM is successful within a few hours.", + "title": "Attention-driven Robotic Manipulation", + "authors": [ + "Stephen James", + "Andrew Davison" + ], + "emails": [ + "~Andrew_Davison1", + "~Stephen_James1" + ], + "rank": 1794 + }, + { + "url": "https://openreview.net/forum?id=Kz42iQirPJI", + "decision": "Reject", + "ratings": [ + 4, + 5, + 6, + 5 + ], + "rating": "5.00", + "confidences": [ + 5, + 3, + 5, + 5 + ], + "abstract": "Meta-learning has made rapid progress in past years, with recent extensions made to avoid catastrophic forgetting in the learning process, namely continual meta learning. It is desirable to generalize the meta learner\u2019s ability to continuously learn in sequential domains, which is largely unexplored to-date. We found through extensive empirical verification that significant improvement\nis needed for current continual learning techniques to be applied in the sequential domain meta learning setting. To tackle the problem, we adapt existing dynamic learning rate adaptation techniques to meta learn both model parameters and learning rates. Adaptation on parameters ensures good generalization performance, while adaptation on learning rates is made to avoid\ncatastrophic forgetting of past domains. Extensive experiments on a sequence of commonly used real-domain data demonstrate the effectiveness of our proposed method, outperforming current strong baselines in continual learning. 
Our code is made publicly available online (anonymous)", + "title": "Towards Learning to Remember in Meta Learning of Sequential Domains", + "authors": [ + "Zhenyi Wang", + "Tiehang Duan", + "Donglin Zhan", + "Changyou Chen" + ], + "emails": [ + "~Zhenyi_Wang1", + "~Donglin_Zhan1", + "~Changyou_Chen1", + "~Tiehang_Duan1" + ], + "rank": 1795 + }, + { + "url": "https://openreview.net/forum?id=Z_TwEk_sP34", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Despite the immense success that deep neural networks (DNNs) have achieved, \\emph{adversarial examples}, which are perturbed inputs that aim to mislead DNNs to make mistakes, have recently led to great concerns. On the other hand, adversarial examples exhibit interesting phenomena, such as \\emph{adversarial transferability}. DNNs also exhibit knowledge transfer, which is critical to improving learning efficiency and learning in domains that lack high-quality training data. To uncover the fundamental connections between these phenomena, we investigate and give an affirmative answer to the question: \\emph{does adversarial transferability indicate knowledge transferability?} We theoretically analyze the relationship between adversarial transferability and knowledge transferability, and outline easily checkable sufficient conditions that identify when adversarial transferability indicates knowledge transferability. In particular, we show that composition with an affine function is sufficient to reduce the difference between two models when they possess high adversarial transferability. 
Furthermore, we provide empirical evaluation for different transfer learning scenarios on diverse datasets, showing a strong positive correlation between the adversarial transferability and knowledge transferability, thus illustrating that our theoretical insights are predictive of practice.", + "title": "Does Adversarial Transferability Indicate Knowledge Transferability?", + "authors": [ + "Kaizhao Liang", + "Jacky Y. Zhang", + "Oluwasanmi O Koyejo", + "Bo Li" + ], + "emails": [ + "~Bo_Li19", + "~Oluwasanmi_O_Koyejo1", + "~Kaizhao_Liang1", + "~Jacky_Y._Zhang1" + ], + "rank": 1796 + }, + { + "url": "https://openreview.net/forum?id=i7aDkDEXJQU", + "decision": "Reject", + "ratings": [ + 5, + 4, + 6, + 5 + ], + "rating": "5.00", + "confidences": [ + 4, + 3, + 3, + 3 + ], + "abstract": "Unsupervised Neural Machine Translation or UNMT has received great attention\nin recent years. Though tremendous empirical improvements have been achieved,\nthere still lacks theory-oriented investigation and thus some fundamental\nquestions like \\textit{why} certain training protocol can work or not under\n\\textit{what} circumstances have not yet been well understood. This paper\nattempts to provide theoretical insights for the above questions. Specifically,\nfollowing the methodology of comparative study, we leverage two perspectives,\ni) \\textit{marginal likelihood maximization} and ii) \\textit{mutual information}\nfrom information theory, to understand the different learning effects from the\nstandard training protocol and its variants. 
Our detailed analyses reveal\nseveral critical conditions for the successful training of UNMT.", + "title": "Demystifying Learning of Unsupervised Neural Machine Translation", + "authors": [ + "Guanlin Li", + "lemao liu", + "Taro Watanabe", + "Conghui Zhu", + "Tiejun Zhao" + ], + "emails": [ + "~Tiejun_Zhao1", + "~lemao_liu1", + "~Guanlin_Li1", + "~Taro_Watanabe1", + "~Conghui_Zhu2" + ], + "rank": 1797 + }, + { + "url": "https://openreview.net/forum?id=0EJjoRbFEcX", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4, + 5 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Although deep neural networks are effective on supervised learning tasks, they have been shown to be brittle. They are prone to overfitting on their training distribution and are easily fooled by small adversarial perturbations. In this paper, we leverage generative models to identify and characterize instances where classifiers fail to generalize.\nWe propose a generative model of the features extracted by a classifier, and show using rigorous hypothesis testing that errors tend to occur when features are assigned low-probability by our model. From this observation, we develop a detection criteria that we test against different sources of classification mistakes: mistakes made on the test set due to poor model generalization, adversarial samples and out-of-distribution samples. 
Our approach is agnostic to class labels from the training set which makes it applicable to models trained in a semi-supervised way.", + "title": "Understanding Classifiers with Generative Models", + "authors": [ + "La\u00ebtitia Shao", + "Yang Song", + "Stefano Ermon" + ], + "emails": [ + "~Yang_Song1", + "~Stefano_Ermon1", + "~La\u00ebtitia_Shao1" + ], + "rank": 1798 + }, + { + "url": "https://openreview.net/forum?id=GPuvhWrEdUn", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 3, + 3, + 4 + ], + "abstract": "To address the issue that deep neural networks (DNNs) are vulnerable to model inversion attacks, we design an objective function to adjust the separability of the hidden data representations as a way to control the trade-off between data utility and vulnerability to inversion attacks. Our method is motivated by the theoretical insights of data separability in neural networking training and results on the hardness of model inversion. Empirically, we show that there exist sweet-spots by adjusting the separability of data representation, such that it is difficult to recover data during inference while maintaining data utility. ", + "title": "MixCon: Adjusting the Separability of Data Representations for Harder Data Recovery", + "authors": [ + "Xiaoxiao Li", + "YANGSIBO HUANG", + "Binghui Peng", + "Zhao Song", + "Kai Li" + ], + "emails": [ + "~Binghui_Peng1", + "~Xiaoxiao_Li1", + "~Kai_Li8", + "~YANGSIBO_HUANG1", + "~Zhao_Song3" + ], + "rank": 1799 + }, + { + "url": "https://openreview.net/forum?id=H8hgu4XsTXi", + "decision": "Reject", + "ratings": [ + 5, + 3, + 5, + 7 + ], + "rating": "5.00", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Decision-making often requires accurate estimation of causal effects from observational data. This is challenging as outcomes of alternative decisions are not observed and have to be estimated. 
Previous methods estimate outcomes based on unconfoundedness but neglect any constraints that unconfoundedness imposes on the outcomes. In this paper, we propose a novel regularization framework in which we formalize unconfoundedness as an orthogonality constraint. We provide theoretical guarantees that this yields an asymptotically normal estimator for the average causal effect. Compared to other estimators, its asymptotic variance is strictly smaller. Based on our regularization framework, we develop deep orthogonal networks for unconfounded treatments (DONUT) which learn outcomes that are orthogonal to the treatment assignment. Using a variety of benchmark datasets for causal inference, we demonstrate that DONUT outperforms the state-of-the-art substantially.", + "title": "Estimating Treatment Effects via Orthogonal Regularization", + "authors": [ + "Tobias Hatt", + "Stefan Feuerriegel" + ], + "emails": [ + "~Tobias_Hatt1", + "ethz.ch" + ], + "rank": 1800 + }, + { + "url": "https://openreview.net/forum?id=7eD88byszZ", + "decision": "Reject", + "ratings": [ + 7, + 4, + 5, + 3 + ], + "rating": "5.00", + "confidences": [ + 4, + 2, + 4, + 3 + ], + "abstract": "Recent spectral graph sparsification research allows constructing nearly-linear-sized subgraphs that can well preserve the spectral (structural) properties of the original graph, such as the first few eigenvalues and eigenvectors of the graph Laplacian, leading to the development of a variety of nearly-linear time numerical and graph algorithms. However, there is not a unified approach that allows for truly scalable spectral sparsification of both directed and undirected graphs. For the first time, we prove the existence of linear-sized spectral sparsifiers for general directed\ngraphs and introduce a practically-efficient and unified spectral graph sparsification approach that allows sparsifying real-world, large-scale directed and undirected graphs with guaranteed preservation of the original graph spectra. 
By exploiting a highly-scalable (nearly-linear complexity) spectral matrix perturbation analysis framework for constructing nearly-linear sized (directed) subgraphs, it enables us to well preserve the key eigenvalues and eigenvectors of the original (directed) graph\nLaplacians. The proposed method has been validated using various kinds of directed graphs obtained from public domain sparse matrix collections, showing promising results for solving directed graph Laplacians, spectral embedding, and partitioning of general directed graphs, as well as approximately computing (personalized) PageRank vectors.", + "title": "A Unified Spectral Sparsification Framework for Directed Graphs", + "authors": [ + "ying zhang", + "Zhiqiang Zhao", + "Zhuo Feng" + ], + "emails": [ + "~Zhuo_Feng3", + "stevens.edu", + "~Zhiqiang_Zhao1" + ], + "rank": 1801 + }, + { + "url": "https://openreview.net/forum?id=mewtfP6YZ7", + "decision": "Reject", + "ratings": [ + 7, + 6, + 3, + 4 + ], + "rating": "5.00", + "confidences": [ + 4, + 2, + 3, + 4 + ], + "abstract": "Model-based reinforcement learning (MBRL) methods have shown strong sample efficiency and performance across a variety of tasks, including when faced with high-dimensional visual observations. These methods learn to predict the environment dynamics and expected reward from interaction and use this predictive model to plan and perform the task. However, MBRL methods vary in their fundamental design choices, and it there is no strong consensus in the literature on how these design decisions affect performance. In this paper, we study a number of design decisions for the predictive model in visual MBRL algorithms, focusing specifically on methods that use a predictive model for planning. We find that a range of design decisions that are often considered crucial, such as the use of latent spaces, have little effect on task performance. 
A big exception to this finding is that predicting future observations (i.e., images) leads to significant task performance improvement compared to only predicting rewards. We also empirically find that image prediction accuracy, somewhat surprisingly, correlates more strongly with downstream task performance than reward prediction accuracy. We show how this phenomenon is related to exploration and how some of the lower-scoring models on standard benchmarks (that require exploration) will perform the same as the best-performing models when trained on the same training data. Simultaneously, in the absence of exploration, models that fit the data better usually perform better on the down-stream task as well, but surprisingly, these are often not the same models that perform the best when learning and exploring from scratch. These findings suggest that performance and exploration place important and potentially contradictory requirements on the model.", + "title": "On Trade-offs of Image Prediction in Visual Model-Based Reinforcement Learning", + "authors": [ + "Mohammad Babaeizadeh", + "Mohammad Taghi Saffar", + "Danijar Hafner", + "Dumitru Erhan", + "Harini Kannan", + "Chelsea Finn", + "Sergey Levine" + ], + "emails": [ + "~Chelsea_Finn1", + "google.com", + "~Mohammad_Babaeizadeh1", + "~Danijar_Hafner1", + "~Mohammad_Taghi_Saffar1", + "~Sergey_Levine1", + "~Dumitru_Erhan1" + ], + "rank": 1802 + }, + { + "url": "https://openreview.net/forum?id=o29tNZZqGcN", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 4 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Graph neural networks (GNN) are powerful models for many graph-structured tasks. In this paper, we aim to bridge GNN to lifelong learning, which is to overcome the effect of ``catastrophic forgetting\" for continuously learning a sequence of graph-structured tasks. 
Although many lifelong learning techniques for convolutional neural networks (CNN) have been developed, lifelong learning for GNN is still underexplored and suffers from incomplete graph structure during learning. This is because in lifelong learning the nodes increase dynamically and can only be present to the model once, which makes many graph models and sampling strategies inapplicable. To solve this problem, we propose a new graph topology based on feature interaction, called the feature graph. It takes features as new nodes and turns nodes into independent graphs. This successfully converts the original problem of node classification to graph classification, in which the increasing nodes are turned into training samples. Therefore, the lifelong learning techniques developed for CNN become applicable to GNN for the first time. In the experiments, we demonstrate both the efficiency and effectiveness of feature graphs for lifelong learning tasks using a rehearsal method. We expect that it will have broad potential applications for graph-structured tasks in lifelong learning.", + "title": "Bridging Graph Network to Lifelong Learning with Feature Interaction", + "authors": [ + "Chen Wang", + "Yuheng Qiu", + "Sebastian Scherer" + ], + "emails": [ + "~Sebastian_Scherer1", + "~Chen_Wang2", + "~Yuheng_Qiu1" + ], + "rank": 1803 + }, + { + "url": "https://openreview.net/forum?id=V8kdi5dRiDg", + "ratings": [ + 4, + 5, + 6, + 5 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Since collecting pixel-level groundtruth data is expensive, unsupervised visual understanding problems are currently an active research topic. In particular, several recent methods based on generative models have achieved promising results for object segmentation and saliency detection. However, since generative models are known to be unstable and sensitive to hyperparameters, the training of these methods can be challenging and time-consuming. 
\n\nIn this work, we introduce an alternative, much simpler way to exploit generative models for unsupervised object segmentation. First, we explore the latent spaces of the publicly available unsupervised models, such as BigBiGAN and StyleGAN2, and reveal the ``segmenting'' latent directions that can be used to obtain saliency masks for GAN-produced images. These masks then are used to train a discriminative segmentation model. Being very simple and easy-to-reproduce, our approach outperforms the state-of-the-art on common benchmarks in the unsupervised scenario.\nAll the code and the pretrained models are available online.", + "title": "Big GANs Are Watching You: Towards Unsupervised Object Segmentation with Off-the-Shelf Generative Models", + "authors": [ + "Andrey Voynov", + "Stanislav Morozov", + "Artem Babenko" + ], + "emails": [ + "~Andrey_Voynov1", + "~Artem_Babenko1", + "~Stanislav_Morozov1" + ], + "rank": 1804 + }, + { + "url": "https://openreview.net/forum?id=T3kmOP_cMFB", + "decision": "Reject", + "ratings": [ + 4, + 4, + 8, + 4 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Zeroth-order optimization (ZO) typically relies on two-point feedback to estimate the unknown gradient of the objective function, which queries the objective function value twice at each time instant. However, if the objective function is time-varying, as in online optimization, two-point feedback can not be used. In this case, the gradient can be estimated using one-point feedback that queries a single function value at each time instant, although at the expense of producing gradient estimates with large variance. In this work, we propose a new one-point feedback method for online optimization that estimates the objective function gradient using the residual between two feedback points at consecutive time instants. We study the regret bound of ZO with residual feedback for both convex and nonconvex online optimization problems. 
Specifically, for both Lipschitz and smooth functions, we show that using residual feedback produces gradient estimates with much smaller variance compared to conventional one-point feedback methods, which improves the learning rate. Our regret bound for ZO with residual feedback is tighter than the existing regret bound for ZO with conventional one-point feedback and relies on weaker assumptions, which suggests that ZO with our proposed residual feedback can better track the optimizer of online optimization problems. We provide numerical experiments that demonstrate that ZO with residual feedback significantly outperforms existing one-point feedback methods in practice.", + "title": "Boosting One-Point Derivative-Free Online Optimization via Residual Feedback", + "authors": [ + "Yan Zhang", + "Yi Zhou", + "Kaiyi Ji", + "Michael Zavlanos" + ], + "emails": [ + "duke.edu", + "~Kaiyi_Ji1", + "~Michael_Zavlanos1", + "~Yi_Zhou2" + ], + "rank": 1805 + }, + { + "url": "https://openreview.net/forum?id=RgDq8-AwvtN", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 2, + 3, + 4, + 3 + ], + "abstract": "While deep learning (DL) has resulted in major breakthroughs in many applications, the frameworks commonly used in DL remain fragile to seemingly innocuous changes in the data. In response, adversarial training has emerged as a principled approach for improving the robustness of DL against norm-bounded perturbations. Despite this progress, DL is also known to be fragile to unbounded shifts in the data distribution due to many forms of natural variation, including changes in weather or lighting in images. However, there are remarkably few techniques that can address robustness to natural, out-of-distribution shifts in the data distribution in a general context. To address this gap, we propose a paradigm shift from perturbation-based adversarial robustness to model-based robust deep learning. 
Critical to our paradigm is to obtain models of natural variation, which vary data over a range of natural conditions. Then by exploiting these models, we develop three novel model-based robust training algorithms that improve the robustness of DL with respect to natural variation. Our extensive experiments show that across a variety of natural conditions in twelve distinct datasets, classifiers trained with our algorithms significantly outperform classifiers trained via ERM, adversarial training, and domain adaptation techniques. Specifically, when training on ImageNet and testing on various subsets of ImageNet-c, our algorithms improve over baseline methods by up to 30 percentage points in top-1 accuracy. Further, we show that our methods provide robustness (1) against natural, out-of-distribution data, (2) against multiple simultaneous distributional shifts, and (3) to domains entirely unseen during training.", + "title": "Model-Based Robust Deep Learning: Generalizing to Natural, Out-of-Distribution Data", + "authors": [ + "Alexander Robey", + "Hamed Hassani", + "George J. Pappas" + ], + "emails": [ + "~George_J._Pappas1", + "~Hamed_Hassani2", + "~Alexander_Robey1" + ], + "rank": 1806 + }, + { + "url": "https://openreview.net/forum?id=0jPp4dKp3PL", + "decision": "Concerns raised (can publish with adjustment)", + "ratings": [ + 5, + 6, + 4 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Online grooming (OG) of children is a pervasive issue in an increasingly interconnected world. We explore various complementary methods to incorporate Corpus Linguistics (CL) knowledge into accurate and interpretable Deep Learning (DL) models. They provide an implicit text normalisation that adapts embedding spaces to the groomers' usage of language, and they focus the DNN's attention onto the expressions of OG strategies. 
We apply these integrations to two architecture types and improve on the state-of-the-art on a new OG corpus.
We evaluate the attribution methodology quantitatively in mainstream benchmarks.", + "title": "On Sparse Critical Paths of Neural Response", + "authors": [ + "Ashkan Khakzar", + "Soroosh Baselizadeh", + "Saurabh Khanduja", + "Christian Rupprecht", + "Seong Tae Kim", + "Nassir Navab" + ], + "emails": [ + "~Seong_Tae_Kim1", + "~Saurabh_Khanduja1", + "~Soroosh_Baselizadeh1", + "~Nassir_Navab1", + "~Christian_Rupprecht1", + "~Ashkan_Khakzar1" + ], + "rank": 1808 + }, + { + "url": "https://openreview.net/forum?id=S8IrSw9dF4W", + "ratings": [ + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "Backdoor attack intends to inject hidden backdoor into the deep neural networks (DNNs), such that the prediction of the infected model will be maliciously changed if the hidden backdoor is activated by the attacker-defined trigger, while it performs well on benign samples. Currently, most of existing backdoor attacks adopted the setting of \\emph{static} trigger, i.e., triggers across the training and testing images follow the same appearance and are located in the same area. In this paper, we revisit this attack paradigm by analyzing the characteristics of the static trigger. We demonstrate that such an attack paradigm is vulnerable when the trigger in testing images is not consistent with the one used for training. We further explore how to utilize this property for backdoor defense, and discuss how to alleviate such vulnerability of existing attacks. 
We hope that this work could inspire more explorations on backdoor properties, to help the design of more advanced backdoor defense and attack methods.", + "title": "Rethinking the Trigger of Backdoor Attack", + "authors": [ + "Yiming Li", + "Tongqing Zhai", + "Baoyuan Wu", + "Yong Jiang", + "Zhifeng Li", + "Shu-Tao Xia" + ], + "emails": [ + "~Zhifeng_Li5", + "~Tongqing_Zhai1", + "~Yong_Jiang3", + "~Shu-Tao_Xia1", + "~Baoyuan_Wu1", + "~Yiming_Li1" + ], + "rank": 1809 + }, + { + "url": "https://openreview.net/forum?id=Oc-Aedbjq0", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 6 + ], + "rating": "5.00", + "confidences": [ + 3, + 5, + 5, + 5 + ], + "abstract": "In this paper, we propose a novel channel pruning method to solve the problem of compression and acceleration of Convolutional Neural Networks (CNNs). Previous channel pruning methods usually ignore the relationships between channels and layers. Many of them parameterize each channel independently by using gates or similar concepts. To fill this gap, a hyper-structure network is proposed to generate the architecture of the main network. Like the existing hypernet, our hyper-structure network can be optimized by regular backpropagation. Moreover, we use a regularization term to specify the computational resource of the compact network. Usually, FLOPs is used as the criterion of computational resource. However, if FLOPs is used in the regularization, it may over penalize early layers. To address this issue, we further introduce learnable layer-wise scaling factors to balance the gradients from different terms, and they can be optimized by hyper-gradient descent. Extensive experimental results on CIFAR-10 and ImageNet show that our method is competitive with state-of-the-art methods. 
", + "title": "Model Compression via Hyper-Structure Network", + "authors": [ + "Shangqian Gao", + "Feihu Huang", + "Heng Huang" + ], + "emails": [ + "~Shangqian_Gao1", + "~Feihu_Huang1", + "~Heng_Huang1" + ], + "rank": 1810 + }, + { + "url": "https://openreview.net/forum?id=tmMmIimNnp", + "ratings": [ + 5, + 5, + 4, + 6 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "In this paper, we propose a pseudo label fusion framework (PLF), a learning framework developed to deal with the domain gap between a source domain and a target domain for performing semantic segmentation based UDA in the unseen target domain. PLF fuses the pseudo labels generated by an ensemble of teacher models. The fused pseudo labels are then used by a student model to distill out the information embedded in these fused pseudo labels to perform semantic segmentation in the target domain. To examine the effectiveness of PLF, we perform a number of experiments on both GTA5 to Cityscapes and SYNTHIA to Cityscapes benchmarks to quantitatively and qualitatively inspect the improvements achieved by employing PLF in performing semantic segmentation in the target domain. Moreover, we provide a number of parameter analyses to validate that the choices made in the design of PLF is both practical and beneficial. 
Our experimental results on both benchmarks shows that PLF indeed offers adequate performance benefits in performing semantic segmentation in the unseen domain, and is able to achieve competitive performance when compared to the contemporary UDA techniques.", + "title": "Semantic Segmentation Based Unsupervised Domain Adaptation via Pseudo-Label Fusion", + "authors": [ + "Chen-Hao Chao", + "Bo-Wun Cheng", + "Chien Feng", + "Chun-Yi Lee" + ], + "emails": [ + "~Chen-Hao_Chao2", + "~Chun-Yi_Lee1", + "~Bo-Wun_Cheng1", + "~Chien_Feng1" + ], + "rank": 1811 + }, + { + "url": "https://openreview.net/forum?id=1AyPW2Emp6", + "decision": "Reject", + "ratings": [ + 5, + 4, + 6 + ], + "rating": "5.00", + "confidences": [ + 5, + 3, + 3 + ], + "abstract": "Randomized smoothing is a popular way of providing robustness guarantees against adversarial attacks: randomly-smoothed functions have a universal Lipschitz-like bound, allowing for robustness certificates to be easily computed. In this work, we show that there also exists a universal curvature-like bound for Gaussian random smoothing: given the exact value and gradient of a smoothed function, we compute a lower bound on the distance of a point to its closest adversarial example, called the Second-order Smoothing (SoS) robustness certificate. In addition to proving the correctness of this novel certificate, we show that SoS certificates are realizable and therefore tight. Interestingly, we show that the maximum achievable benefits, in terms of certified robustness, from using the additional information of the gradient norm are relatively small: because our bounds are tight, this is a fundamental negative result. The gain of SoS certificates further diminishes if we consider the estimation error of the gradient norms, for which we have developed an estimator. 
We therefore additionally develop a variant of Gaussian smoothing, called Gaussian dipole smoothing, which provides similar bounds to randomized smoothing with gradient information, but with much-improved sample efficiency. This allows us to achieve (marginally) improved robustness certificates on high-dimensional datasets such as CIFAR-10 and ImageNet.", + "title": "Tight Second-Order Certificates for Randomized Smoothing", + "authors": [ + "Alexander Levine", + "Aounon Kumar", + "Tom Goldstein", + "Soheil Feizi" + ], + "emails": [ + "~Tom_Goldstein1", + "~Alexander_Levine2", + "umd.edu", + "~Soheil_Feizi2" + ], + "rank": 1812 + }, + { + "url": "https://openreview.net/forum?id=g4E6SAAvACo", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 6 + ], + "rating": "5.00", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "The time and effort involved in hand-designing deep neural networks is immense. This has prompted the development of Neural Architecture Search (NAS) techniques to automate this design. However, NAS algorithms tend to be slow and expensive; they need to train vast numbers of candidate networks to inform the search process. This could be remedied if we could infer a network's trained accuracy from its initial state. In this work, we examine the correlation of linear maps induced by augmented versions of a single image in untrained networks and motivate how this can be used to give a measure which is highly indicative of a network\u2019s trained performance. We incorporate this measure into a simple algorithm that allows us to search for powerful networks without any training in a matter of seconds on a single GPU, and verify its effectiveness on NAS-Bench-101 and NAS-Bench-201. 
Finally, we show that our approach can be readily combined with more expensive search methods for added value: we modify regularised evolutionary search to produce a novel algorithm that outperforms its predecessor.", + "title": "Neural Architecture Search without Training", + "authors": [ + "Joseph Mellor", + "Jack Turner", + "Amos Storkey", + "Elliot J. Crowley" + ], + "emails": [ + "~Amos_Storkey1", + "~Jack_Turner1", + "~Elliot_J._Crowley1", + "ed.ac.uk" + ], + "rank": 1813 + }, + { + "url": "https://openreview.net/forum?id=uJSBC7QCfrX", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "This paper proposes Differential-Critic Generative Adversarial Network (DiCGAN) to learn the distribution of user-desired data when only partial instead of the entire dataset possesses the desired properties. Existing approaches select the desired samples first and train regular GANs on the selected samples to derive the user-desired data distribution. DiCGAN introduces a differential critic that can learn the preference direction from the pairwise preferences over the entire dataset. The resultant critic would guide the generation of the desired data instead of the whole data. Specifically, apart from the Wasserstein GAN loss, a ranking loss of the pairwise preferences is defined over the critic. It endows the difference of critic values between each pair of samples with the pairwise preference relation. The higher critic value indicates that the sample is preferred by the user. Thus training the generative model for higher critic values would encourage generating the user-preferred samples. 
Extensive experiments show that our DiCGAN can learn the user-desired data distributions.", + "title": "Differential-Critic GAN: Generating What You Want by a Cue of Preferences", + "authors": [ + "Yinghua Yao", + "Yuangang Pan", + "Ivor Tsang", + "Xin Yao" + ], + "emails": [ + "~Ivor_Tsang1", + "~Yinghua_Yao1", + "~Yuangang_Pan2", + "~Xin_Yao1" + ], + "rank": 1814 + }, + { + "url": "https://openreview.net/forum?id=wyiWss_DNoO", + "ratings": [ + 4, + 6, + 5 + ], + "rating": "5.00", + "confidences": [ + 5, + 5, + 4 + ], + "abstract": "Temporal modeling still remains challenging for action recognition in videos. To mitigate this issue, this paper presents a new video architecture, termed as Temporal Difference Network (TDN), with a focus on capturing multi-scale temporal information for efficient action recognition. The core of our TDN is to devise an efficient temporal module (TDM) by explicitly leveraging a temporal difference operator, and systematically assess its effect on short-term and long-term motion modeling. To fully capture temporal information over the entire video, our TDN is established with a two-level difference modeling paradigm. Specifically, for local motion modeling, temporal difference over consecutive frames is used to supply 2D CNNs with finer motion pattern, while for global motion modeling, temporal difference across segments is incorporated to capture long-range structure for motion feature excitation. TDN provides a simple and principled temporal modeling framework, and could be instantiated with the existing CNNs at a small extra computational cost. Our TDN presents a new state of the art on the datasets of Something-Something V1 \\& V2 and Kinetics-400 under the setting of using similar backbones. 
In addition, we present some visualization results on our TDN and try to provide new insights on temporal difference operation.", + "title": "Temporal Difference Networks for Action Recognition", + "authors": [ + "Limin Wang", + "Bin Ji", + "Zhan Tong", + "Gangshan Wu" + ], + "emails": [ + "~Bin_Ji2", + "~Gangshan_Wu1", + "~Limin_Wang1", + "~Zhan_Tong1" + ], + "rank": 1815 + }, + { + "url": "https://openreview.net/forum?id=69EFStdgTD2", + "decision": "Reject", + "ratings": [ + 6, + 5, + 7, + 3 + ], + "rating": "5.00", + "confidences": [ + 4, + 2, + 3, + 5 + ], + "abstract": "Increasingly machine learning systems are being deployed to edge servers and devices (e.g. mobile phones) and trained in a collaborative manner. Such distributed/federated/decentralized training raises a number of concerns about the robustness, privacy, and security of the procedure. While extensive work has been done in tackling with robustness, privacy, or security individually, their combination has rarely been studied. In this paper, we propose a secure multi-server protocol that offers both input privacy and Byzantine-robustness. In addition, this protocol is communication-efficient, fault-tolerant, and enjoys local differential privacy.", + "title": "Secure Byzantine-Robust Machine Learning", + "authors": [ + "Lie He", + "Sai Praneeth Karimireddy", + "Martin Jaggi" + ], + "emails": [ + "~Sai_Praneeth_Karimireddy1", + "~Martin_Jaggi1", + "~Lie_He1" + ], + "rank": 1816 + }, + { + "url": "https://openreview.net/forum?id=T4gXBOXoIUr", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4 + ], + "rating": "5.00", + "confidences": [ + 5, + 4, + 4 + ], + "abstract": "Learning visual representations of medical images is core to medical image understanding but its progress has been held back by the small size of hand-labeled datasets. 
Existing work commonly relies on transferring weights from ImageNet pretraining, which is suboptimal due to drastically different image characteristics, or rule-based label extraction from the textual report data paired with medical images, which is inaccurate and hard to generalize. We propose an alternative unsupervised strategy to learn medical visual representations directly from the naturally occurring pairing of images and textual data. Our method of pretraining medical image encoders with the paired text data via a bidirectional contrastive objective between the two modalities is domain-agnostic, and requires no additional expert input. We test our method by transferring our pretrained weights to 4 medical image classification tasks and 2 zero-shot retrieval tasks, and show that our method leads to image representations that considerably outperform strong baselines in most settings. Notably, in all 4 classification tasks, our method requires only 10% as much labeled training data as an ImageNet initialized counterpart to achieve better or comparable performance, demonstrating superior data efficiency.", + "title": "Contrastive Learning of Medical Visual Representations from Paired Images and Text", + "authors": [ + "Yuhao Zhang", + "Hang Jiang", + "Yasuhide Miura", + "Christopher D Manning", + "Curtis Langlotz" + ], + "emails": [ + "~Christopher_D_Manning1", + "~Curtis_Langlotz1", + "stanford.edu", + "~Yuhao_Zhang3" + ], + "rank": 1817 + }, + { + "url": "https://openreview.net/forum?id=BW5PuV4V-rL", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 2, + 4, + 3, + 4, + 3 + ], + "abstract": "We present an approach for efficiently training Gaussian Mixture Models by SGD on non-stationary, high-dimensional streaming data.\nOur training scheme does not require data-driven parameter initialization (e.g., k-means) and has the ability to process high-dimensional samples without numerical 
problems.\nFurthermore, the approach allows mini-batch sizes as low as 1, typical for streaming-data settings, and it is possible to react and adapt to changes in data statistics (concept drift/shift) without catastrophic forgetting.\nMajor problems in such streaming-data settings are undesirable local optima during early training phases and numerical instabilities due to high data dimensionalities.%, and catastrophic forgetting when encountering concept drift.\nWe introduce an adaptive annealing procedure to address the first problem,%, which additionally plays a decisive role in controlling the \\acp{GMM}' reaction to concept drift.\nwhereas numerical instabilities are eliminated by using an exponential-free approximation to the standard \\ac{GMM} log-likelihood.\nExperiments on a variety of visual and non-visual benchmarks show that our SGD approach can be trained completely without, for instance, k-means based centroid initialization, and compares favorably to sEM, an online variant of EM.", + "title": "Gradient-based training of Gaussian Mixture Models for High-Dimensional Streaming Data", + "authors": [ + "Alexander Gepperth", + "Benedikt Pf\u00fclb" + ], + "emails": [ + "~Benedikt_Pf\u00fclb1", + "~Alexander_Gepperth1" + ], + "rank": 1818 + }, + { + "url": "https://openreview.net/forum?id=NNd0J677PN", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5, + 6 + ], + "rating": "5.00", + "confidences": [ + 2, + 4, + 4, + 2 + ], + "abstract": "While federated learning (FL) enables distributed agents to collaboratively train a centralized model without sharing data with each other, it fails to protect users against inference attacks that mine private information from the centralized model. Thus, facilitating federated learning methods with differential privacy (DPFL) becomes attractive. 
Existing algorithms based on privately aggregating clipped gradients require many rounds of communication, which may not converge, and cannot scale up to large-capacity models due to explicit dimension-dependence in its added noise. In this paper, we adopt the knowledge transfer model of private learning pioneered by Papernot et al. (2017; 2018) and extend their algorithm PATE, as well as the recent alternative PrivateKNN (Zhu et al., 2020) to the federated learning setting. The key difference is that our method privately aggregates the labels from the agents in a voting scheme, instead of aggregating the gradients, hence avoiding the dimension dependence and achieving signi\ufb01cant savings in communication cost. Theoretically, we show that when the margins of the voting scores are large, the agents enjoy exponentially higher accuracy and stronger (data-dependent) differential privacy guarantees on both agent-level and instance-level. Extensive experiments show that our approach signi\ufb01cantly improves the privacy-utility trade-off over the current state-of-the-art in DPFL.", + "title": "Voting-based Approaches For Differentially Private Federated Learning", + "authors": [ + "Yuqing Zhu", + "Xiang Yu", + "Yi-Hsuan Tsai", + "Francesco Pittaluga", + "Masoud Faraki", + "Manmohan Chandraker", + "Yu-Xiang Wang" + ], + "emails": [ + "~Yu-Xiang_Wang1", + "~Masoud_Faraki2", + "~Yi-Hsuan_Tsai1", + "~Francesco_Pittaluga2", + "~Yuqing_Zhu1", + "~Manmohan_Chandraker3", + "~Xiang_Yu1" + ], + "rank": 1819 + }, + { + "url": "https://openreview.net/forum?id=O_PZRnYcUCm", + "ratings": [ + 4, + 5, + 5, + 6 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Blank", + "title": "Blank", + "authors": [ + "Moitreya Chatterjee", + "Anoop Cherian", + "Narendra Ahuja" + ], + "emails": [ + "~Narendra_Ahuja1", + "~Anoop_Cherian1", + "~Moitreya_Chatterjee1" + ], + "rank": 1820 + }, + { + "url": "https://openreview.net/forum?id=MPO4oML_JC", + "decision": 
"Reject", + "ratings": [ + 5, + 5, + 6, + 4 + ], + "rating": "5.00", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Exploration is critical for good results of deep reinforcement learning algorithms and has drawn much attention. However, existing multi-agent deep reinforcement learning algorithms still use mostly noise-based techniques. It was recognized recently that noise-based exploration is suboptimal in multi-agent settings, and exploration methods that consider agents' cooperation have been developed. However, existing methods suffer from a common challenge: agents struggle to identify states that are worth exploring, and don't coordinate their exploration efforts toward those states. To address this shortcoming, in this paper, we proposed coordinated multi-agent exploration (CMAE): agents share a common goal while exploring. The goal is selected by a normalized entropy-based technique from multiple projected state spaces. Then, agents are trained to reach the goal in a coordinated manner. We demonstrated that our approach needs only 1%\u22125% of the environment steps to achieve similar or better returns than state-of-the-art baselines on various sparse-reward tasks, including a sparse-reward version of the Starcraft multi-agent challenge (SMAC).", + "title": "Coordinated Multi-Agent Exploration Using Shared Goals", + "authors": [ + "Iou-Jen Liu", + "Unnat Jain", + "Alex Schwing" + ], + "emails": [ + "~Alex_Schwing1", + "~Unnat_Jain1", + "~Iou-Jen_Liu1" + ], + "rank": 1821 + }, + { + "url": "https://openreview.net/forum?id=hE3JWimujG", + "decision": "Reject", + "ratings": [ + 7, + 5, + 3 + ], + "rating": "5.00", + "confidences": [ + 3, + 5, + 3 + ], + "abstract": "The brain solves the credit assignment problem remarkably well. For credit to be correctly assigned across multiple cortical areas a given area should, in principle, wait for others to finish their computation. How the brain deals with this locking problem has remained unclear. 
Deep learning methods suffer from similar locking constraints both on the forward and backward phase. Recently, decoupled neural interfaces (DNI) were introduced as a solution to the forward and backward locking problems.\nHere we propose that a specialised brain region, the cerebellum, helps the cerebral cortex solve the locking problem closely matching the computations and architecture of DNI. In particular, we propose that classical cerebellar forward and inverse models are equivalent to solving the backward and forward locking problems, respectively. To demonstrate the potential of this framework we focus on modelling a given brain area as a recurrent neural network in which the cerebellum approximates temporal feedback signals as provided by BPTT. We tested the cortico-cerebellar-DNI (CC-DNI) model in a range of sensorimotor and cognitive tasks that have been shown to be cerebellar-dependent. First, we show that the CC-DNI unlocking mechanisms can facilitate learning in a simple target reaching task. Next, by building on the sequential MNIST task we demonstrate that these results generalise to more complex sensorimotor tasks. Our cortico-cerebellar model readily applies to a wider range of modalities, to demonstrate this we tested the model in a cognitive task, caption generation. Models without the cerebellar-DNI component exhibit deficits similar to those observed in cerebellar patients in both motor and cognitive tasks. Moreover, we used CC-DNI to generate a set of specific neuroscience predictions. 
Finally, we introduce a CC-DNI model with highly sparse connectivity as observed in the cerebellum, which substantially reduces the number of parameters while improving learning through decorrelation.\nOverall, our work offers a novel perspective on the cerebellum as a brain-wide decoupling machine for efficient credit assignment and opens a new avenue of research between deep learning and neuroscience.", + "title": "Cortico-cerebellar networks as decoupled neural interfaces", + "authors": [ + "Joseph Pemberton", + "Ellen Boven", + "Richard Apps", + "Rui Ponte Costa" + ], + "emails": [ + "~Rui_Ponte_Costa3", + "bristol.ac.uk" + ], + "rank": 1822 + }, + { + "url": "https://openreview.net/forum?id=KcImcc3j-qS", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 4 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 5, + 5 + ], + "abstract": "In Bayesian Deep Learning, distributions over the output of classification neural networks are approximated by first constructing a Gaussian distribution over the weights, then sampling from it to receive a distribution over the categorical output distribution. This is costly. We reconsider old work to construct a Dirichlet approximation of this output distribution, which yields an analytic map between Gaussian distributions in logit space and Dirichlet distributions (the conjugate prior to the categorical) in the output space. We argue that the resulting Dirichlet distribution has theoretical and practical advantages, in particular, more efficient computation of the uncertainty estimate, scaling to large datasets and networks like ImageNet and DenseNet. 
We demonstrate the use of this Dirichlet approximation by using it to construct a lightweight uncertainty-aware output ranking for the ImageNet setup.", + "title": "Fast Predictive Uncertainty for Classification with Bayesian Deep Networks", + "authors": [ + "Marius Hobbhahn", + "Agustinus Kristiadi", + "Philipp Hennig" + ], + "emails": [ + "~Philipp_Hennig1", + "~Agustinus_Kristiadi1", + "~Marius_Hobbhahn1" + ], + "rank": 1823 + }, + { + "url": "https://openreview.net/forum?id=xOBMyvoMQw8", + "decision": "Reject", + "ratings": [ + 5, + 4, + 6 + ], + "rating": "5.00", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "Common Stochastic Gradient MCMC methods approximate gradients by stochastic ones via uniformly subsampled data points. A non-uniform subsampling scheme, however, can reduce the variance introduced by the stochastic approximation and make the sampling of a target distribution more accurate. For this purpose, an exponentially weighted stochastic gradient approach (EWSG) is developed to match the transition kernel of a non-uniform-SG-MCMC method with that of a batch-gradient-MCMC method. If needed to be put in the importance sampling (IS) category, EWSG can be viewed as a way to extend the IS+SG approach successful for optimization to the sampling setup. EWSG works for a range of MCMC methods, and a demonstration on Stochastic-Gradient 2nd-order Langevin is provided. In our practical implementation of EWSG, the non-uniform subsampling is performed efficiently via a Metropolis-Hasting chain on the data index, which is coupled to the sampling algorithm. The fact that our method has reduced local variance with high probability is theoretically analyzed. A non-asymptotic global error analysis is also presented. As a practical implementation contains hyperparameters, numerical experiments based on both synthetic and real world data sets are provided, to both demonstrate the empirical performances and recommend hyperparameter choices. 
Notably, while statistical accuracy has improved, the speed of convergence, with appropriately chosen hyper-parameters, was empirically observed to be at least comparable to the uniform version, which renders EWSG a practically useful alternative to common variance reduction treatments.", + "title": "Improving Sampling Accuracy of Stochastic Gradient MCMC Methods via Non-uniform Subsampling of Gradients", + "authors": [ + "Ruilin Li", + "Xin Wang", + "Hongyuan Zha", + "Molei Tao" + ], + "emails": [ + "~Hongyuan_Zha1", + "gmail.com", + "~Ruilin_Li1", + "~Molei_Tao1" + ], + "rank": 1824 + }, + { + "url": "https://openreview.net/forum?id=j39sWOYhfEg", + "decision": "Reject", + "ratings": [ + 3, + 5, + 6, + 6 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Adversarial training has been the topic of dozens of studies and a leading method for defending against adversarial attacks.\nYet, it remains largely unknown (a) how adversarially-robust ImageNet classifiers (R classifiers) generalize to out-of-distribution examples; and (b) how their generalization capability relates to their hidden representations. In this paper, we perform a thorough, systematic study to answer these two questions across AlexNet, GoogLeNet, and ResNet-50 architectures. We found that while standard ImageNet classifiers have a strong texture bias, their R counterparts rely heavily on shapes. Remarkably, adversarial training induces three simplicity biases into hidden neurons in the process of ``\"robustifying\" networks. That is, each convolutional neuron in R networks often changes to detecting (1) pixel-wise smoother patterns i.e. a mechanism that blocks high-frequency noise from passing through the network; (2) more lower-level features i.e. textures and colors (instead of objects); and (3) fewer types of inputs. Our findings reveal the interesting mechanisms that made networks more adversarially robust and also explain some recent findings e.g. 
why R networks benefit from much larger capacity and can act as a strong image prior in image synthesis. ", + "title": "The shape and simplicity biases of adversarially robust ImageNet-trained CNNs", + "authors": [ + "Peijie Chen", + "Chirag Agarwal", + "Anh Nguyen" + ], + "emails": [ + "~Chirag_Agarwal1", + "gmail.com", + "~Anh_Nguyen1" + ], + "rank": 1825 + }, + { + "url": "https://openreview.net/forum?id=NjpEx8XzDvm", + "ratings": [ + 6, + 5, + 4, + 5 + ], + "rating": "5.00", + "confidences": [ + 4, + 2, + 4, + 3 + ], + "abstract": "A popular heuristic for improved performance in Generative adversarial networks (GANs) is to use some form of gradient penalty on the discriminator. This gradient penalty was originally motivated by a Wasserstein distance formulation. However, the use of gradient penalty in other GAN formulations is not well motivated. We present a unifying framework of expected margin maximization and show that a wide range of gradient-penalized GANs (e.g., Wasserstein, Standard, Least-Squares, and Hinge GANs) can be derived from this framework. Our results imply that employing gradient penalties induces a large-margin classifier (thus, a large-margin discriminator in GANs). We describe how expected margin maximization helps reduce vanishing gradients at fake (generated) samples, a known problem in GANs. 
From this framework, we derive a new L\u221e gradient norm penalty with Hinge loss which generally produces equally good (or better) generated output in GANs than L2-norm penalties (based on the Fr\u00e9chet Inception Distance).", + "title": "Gradient penalty from a maximum margin perspective", + "authors": [ + "Alexia Jolicoeur-Martineau", + "Ioannis Mitliagkas" + ], + "emails": [ + "~Alexia_Jolicoeur-Martineau1", + "~Ioannis_Mitliagkas1" + ], + "rank": 1826 + }, + { + "url": "https://openreview.net/forum?id=mWnfMrd9JLr", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 6, + 5 + ], + "rating": "5.00", + "confidences": [ + 3, + 4, + 4, + 4, + 5 + ], + "abstract": " Flow-based generative models typically define a latent space with dimensionality identical to the observational space. In many problems, however, the data does not populate the full ambient data-space that they natively reside in, but rather inhabit a lower-dimensional manifold. In such scenarios, flow-based models are unable to represent data structures exactly as their density will always have support off the data manifold, potentially resulting in degradation of model performance. In addition, the requirement for equal latent and data space dimensionality can unnecessarily increase model complexity for contemporary flow models. Towards addressing these problems, we propose to learn a manifold prior that affords benefits to both the tasks of sample generation and representation quality. 
An auxiliary product of our approach is that we are able to identify the intrinsic dimension of the data distribution.", + "title": "On the Latent Space of Flow-based Models", + "authors": [ + "Mingtian Zhang", + "Yitong Sun", + "Steven McDonagh", + "Chen Zhang" + ], + "emails": [ + "~Steven_McDonagh1", + "~Mingtian_Zhang1", + "~Yitong_Sun1", + "~Chen_Zhang6" + ], + "rank": 1827 + }, + { + "url": "https://openreview.net/forum?id=_Ea-ECV6Vkm", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "In this paper, we address the problem of learning the representations from images without human annotations. We study the instance classification solution, which regards each instance as a category, and improve the optimization and feature quality. The proposed consistent instance classification (ConIC) approach simultaneously optimizes the classification loss and an additional consistency loss explicitly penalizing the feature dissimilarity between the augmented views from the same instance. The benefit of optimizing the consistency loss is that the learned features for augmented views from the same instance are more compact and accordingly the classification loss optimization becomes easier, thus boosting the quality of the learned representations. This differs from InstDisc and MoCo that use an estimated prototype as the classifier weight to ease the optimization. Different from SimCLR that directly compares different instances, our approach does not require large batch size. 
Experimental results demonstrate competitive performance for linear evaluation and better performance than InstDisc, MoCo and SimCLR at downstream tasks, such as detection and segmentation, as well as competitive or superior performance compared to other methods with stronger training setting.", + "title": "Consistent Instance Classification for Unsupervised Representation Learning", + "authors": [ + "Depu Meng", + "Zigang Geng", + "Zhirong Wu", + "Bin Xiao", + "Houqiang Li", + "Jingdong Wang" + ], + "emails": [ + "~Depu_Meng1", + "~Jingdong_Wang1", + "~Bin_Xiao2", + "~Zigang_Geng1", + "~Houqiang_Li1", + "~Zhirong_Wu1" + ], + "rank": 1828 + }, + { + "url": "https://openreview.net/forum?id=PghuCwnjF6y", + "decision": "Reject", + "ratings": [ + 5, + 5, + 7, + 3 + ], + "rating": "5.00", + "confidences": [ + 4, + 2, + 4, + 4 + ], + "abstract": "We present TaskSet, a dataset of tasks for use in training and evaluating optimizers. TaskSet is unique in its size and diversity, containing over a thousand tasks ranging from image classification with fully connected or convolutional neural networks, to variational autoencoders, to non-volume preserving flows on a variety of datasets. As an example application of such a dataset we explore meta-learning an ordered list of hyperparameters to try sequentially. By learning this hyperparameter list from data generated using TaskSet we achieve large speedups in sample efficiency over random search. Next we use the diversity of the TaskSet and our method for learning hyperparameter lists to empirically explore the generalization of these lists to new optimization tasks in a variety of settings including ImageNet classification with Resnet50 and LM1B language modeling with transformers. 
As part of this work we have opensourced code for all tasks, as well as ~29 million training curves for these problems and the corresponding hyperparameters.", + "title": "TaskSet: A Dataset of Optimization Tasks", + "authors": [ + "Luke Metz", + "Niru Maheswaranathan", + "Ruoxi Sun", + "C. Daniel Freeman", + "Ben Poole", + "Jascha Sohl-Dickstein" + ], + "emails": [ + "~C._Daniel_Freeman1", + "~Jascha_Sohl-Dickstein2", + "~Niru_Maheswaranathan1", + "~Luke_Metz1", + "~Ruoxi_Sun2", + "~Ben_Poole1" + ], + "rank": 1829 + }, + { + "url": "https://openreview.net/forum?id=KjeUNkU2d26", + "decision": "Reject", + "ratings": [ + 4, + 4, + 7 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Content and style (C-S) disentanglement intends to decompose the underlying explanatory factors of objects into two independent latent spaces. Aiming for unsupervised disentanglement, we introduce an inductive bias to our formulation by assigning different and independent roles to content and style when approximating the real data distributions. The content embeddings of individual images are forced to share a common distribution. The style embeddings encoding instance-specific features are used to customize the shared distribution. The experiments on several popular datasets demonstrate that our method achieves the state-of-the-art disentanglement compared to other unsupervised approaches and comparable or even better results than supervised methods. 
Furthermore, as a new application of C-S disentanglement, we propose to generate multi-view images from a single view image for 3D reconstruction.", + "title": "Rethinking Content and Style: Exploring Bias for Unsupervised Disentanglement", + "authors": [ + "Xuanchi Ren", + "Tao Yang", + "Wenjun Zeng", + "Yuwang Wang" + ], + "emails": [ + "~Wenjun_Zeng3", + "~Yuwang_Wang3", + "~Xuanchi_Ren1", + "~Tao_Yang9" + ], + "rank": 1830 + }, + { + "url": "https://openreview.net/forum?id=lbHDMllIYI1", + "decision": "Reject", + "ratings": [ + 7, + 5, + 4, + 4 + ], + "rating": "5.00", + "confidences": [ + 4, + 3, + 5, + 3 + ], + "abstract": "Over-parameterization of neural networks is a well known issue that comes along with their great performance. Among the many approaches proposed to tackle this problem, low-rank tensor decompositions are largely investigated to compress deep neural networks. Such techniques rely on a low-rank assumption of the layer weight tensors that does not always hold in practice. Following this observation, this paper studies sparsity inducing techniques to build new sparse matrix product layer for high-rate neural networks compression. Specifically, we explore recent advances in sparse optimization to replace each layer's weight matrix, either convolutional or fully connected, by a product of sparse matrices. 
Our experiments validate that our approach provides a better compression-accuracy trade-off than most popular low-rank-based compression techniques.\n", + "title": "Sparse matrix products for neural network compression", + "authors": [ + "Luc Giffon", + "hachem kadri", + "Stephane Ayache", + "Ronan Sicre", + "thierry artieres" + ], + "emails": [ + "~hachem_kadri2", + "~Ronan_Sicre3", + "~thierry_artieres1", + "~Stephane_Ayache1", + "~Luc_Giffon1" + ], + "rank": 1831 + }, + { + "url": "https://openreview.net/forum?id=yZkF6xqhfQ", + "decision": "Reject", + "ratings": [ + 4, + 4, + 6, + 6 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Recently researchers have demonstrated that Transformers can be trained to learn symbolic tasks such as solving integration and differential equations in an end-to-end fashion. In these setups, for an input symbolic expression, the Transformer predicts the final solution in a single step. Since such tasks may consist of a sequence of logical steps, question remains whether such networks have understood and learnt individual steps to reach the solution. To take a deeper look, we consider the task of polynomial simplification. Polynomials can be written in a simple normal form as a sum of monomials which are ordered in a lexicographic order. For a polynomial which is not necessarily in this normal form, a sequence of simplification steps is applied to reach the fully simplified (i.e., in the normal form) polynomial. For this task, we describe a synthetic Polynomial dataset generation algorithm which generates polynomials with unique proof steps. Then, we conduct an extensive analysis of the Transformer\u2019s abilities to learn the polynomial simplification task along different dimensions.", + "title": "Do Transformers Understand Polynomial Simplification? 
", + "authors": [ + "Vishesh Agarwal", + "Somak Aditya", + "Navin Goyal" + ], + "emails": [ + "~Somak_Aditya1", + "~Navin_Goyal1", + "microsoft.com" + ], + "rank": 1832 + }, + { + "url": "https://openreview.net/forum?id=pavee2r1N01", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4 + ], + "rating": "5.00", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "Recent work has demonstrated that neural networks are vulnerable to small, adversarial perturbations of their input. In this paper, we propose an efficient regularization scheme inspired by convex geometry and barrier methods to improve the robustness of feedforward ReLU networks. Since such networks are piecewise linear, they partition the input space into polyhedral regions (polytopes). Our regularizer is designed to minimize the distance between training samples and the \\textit{analytical centers} of their respective polytopes so as to push points away from the boundaries. Our regularizer \\textit{provably} improves a lower bound on the necessary adversarial perturbation required to switch an example's label. The addition of a second regularizer that encourages linear decision boundaries improves robustness while avoiding over-regularization of the classifier. We demonstrate the robustness of our approach with respect to \u2113\u221e and \u21132 adversarial perturbations on multiple datasets. Our method is competitive with state-of-the-art algorithms for learning robust networks. 
Moreover, applying our algorithm in conjunction with adversarial training boosts the robustness of classifiers even further.\n", + "title": "Provable Robustness by Geometric Regularization of ReLU Networks", + "authors": [ + "Chester Holtz", + "Changhao Shi", + "Gal Mishne" + ], + "emails": [ + "~Chester_Holtz1", + "~Gal_Mishne1", + "~Changhao_Shi1" + ], + "rank": 1833 + }, + { + "url": "https://openreview.net/forum?id=DsbhGImWjF", + "ratings": [ + 6, + 5, + 4, + 5 + ], + "rating": "5.00", + "confidences": [ + 4, + 5, + 4, + 5 + ], + "abstract": "There has been a large literature on neural architecture search, but most existing work made use of heuristic rules that largely constrained the search flexibility. In this paper, we first relax these manually designed constraints and enlarge the search space to contain more than 10117 candidates. In the new space, most existing differentiable search methods can fail dramatically. We then propose a novel algorithm named Gradual One-Level Differentiable Neural Architecture Search (GOLD-NAS) which introduces a variable resource constraint to one-level optimization so that the weak operators are gradually pruned out from the super-network. In standard image classification benchmarks, GOLD-NAS can find a series of Pareto-optimal architectures within a single search procedure. Most of the discovered architectures were never studied before, yet they achieve a nice tradeoff between recognition accuracy and model complexity. 
GOLD-NAS also shows generalization ability in extended search spaces with different candidate operators.\n", + "title": "GOLD-NAS: Gradual, One-Level, Differentiable", + "authors": [ + "Kaifeng Bi", + "Lingxi Xie", + "Xin Chen", + "Longhui Wei", + "Qi Tian" + ], + "emails": [ + "~Lingxi_Xie1", + "~Longhui_Wei1", + "~Qi_Tian3", + "huawei.com" + ], + "rank": 1834 + }, + { + "url": "https://openreview.net/forum?id=AT7jak63NNK", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5, + 6 + ], + "rating": "5.00", + "confidences": [ + 5, + 4, + 3, + 4 + ], + "abstract": "Reinforcement learning algorithms can acquire policies for complex tasks autonomously. However, the number of samples required to learn a diverse set of skills can be prohibitively large. While meta-reinforcement learning methods have enabled agents to leverage prior experience to adapt quickly to new tasks, their performance depends crucially on how close the new task is to the previously experienced tasks. Current approaches are either not able to extrapolate well, or can do so at the expense of requiring extremely large amounts of data for on-policy meta-training. In this work, we present model identification and experience relabeling (MIER), a meta-reinforcement learning algorithm that is both efficient and extrapolates well when faced with out-of-distribution tasks at test time. Our method is based on a simple insight: we recognize that dynamics models can be adapted efficiently and consistently with off-policy data, more easily than policies and value functions. 
These dynamics models can then be used to continue training policies and value functions for out-of-distribution tasks without using meta-reinforcement learning at all, by generating synthetic experience for the new task.", + "title": "Meta-Reinforcement Learning Robust to Distributional Shift via Model Identification and Experience Relabeling", + "authors": [ + "Russell Mendonca", + "Xinyang Geng", + "Chelsea Finn", + "Sergey Levine" + ], + "emails": [ + "~Xinyang_Geng1", + "~Chelsea_Finn1", + "~Sergey_Levine1", + "~Russell_Mendonca1" + ], + "rank": 1835 + }, + { + "url": "https://openreview.net/forum?id=YZrQKLHFhv3", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 5, + 4, + 5, + 3 + ], + "abstract": "Convolutional neural networks (CNNs) are commonly trained using a fixed spatial image size predetermined for a given model. Although trained on images of a specific size, it is well established that CNNs can be used to evaluate a wide range of image sizes at test time, by adjusting the size of intermediate feature maps. \\\nIn this work, we describe and evaluate a novel mixed-size training regime that mixes several image sizes at training time. We demonstrate that models trained using our method are more resilient to image size changes and generalize well even on small images. This allows faster inference by using smaller images at test time. For instance, we receive a 76.43% top-1 accuracy using ResNet50 with an image size of 160, which matches the accuracy of the baseline model with 2\u00d7 fewer computations.\nFurthermore, for a given image size used at test time, we show this method can be exploited either to accelerate training or the final test accuracy. 
For example, we are able to reach a 79.27% accuracy with a model evaluated at a 288 spatial size for a relative improvement of 14% over the baseline.\nOur PyTorch implementation and pre-trained models are publicly available\\footnote{\\url{https://github.com/paper-submissions/mix-match}", + "title": "MixSize: Training Convnets With Mixed Image Sizes for Improved Accuracy, Speed and Scale Resiliency", + "authors": [ + "Elad Hoffer", + "Berry Weinstein", + "Itay Hubara", + "Tal Ben-Nun", + "Torsten Hoefler", + "Daniel Soudry" + ], + "emails": [ + "inf.ethz.ch", + "~Tal_Ben-Nun1", + "~Berry_Weinstein1", + "~Daniel_Soudry1", + "~Elad_Hoffer1", + "~Itay_Hubara1" + ], + "rank": 1836 + }, + { + "url": "https://openreview.net/forum?id=3xUBgZQ04X", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 3 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Generative Adversarial Networks (GANs) are performant generative methods yielding high-quality samples. However, under certain circumstances, the training of GANs can lead to mode collapse or mode dropping, i.e. the generative models not being able to sample from the entire probability distribution. To address this problem, we use the last layer of the discriminator as a feature map to study the distribution of the real and the fake data. During training, we propose to match the real batch diversity to the fake batch diversity by using the Bures distance between covariance matrices in feature space. The computation of the Bures distance can be conveniently done in either feature space or kernel space in terms of the covariance and kernel matrix respectively. We observe that diversity matching reduces mode collapse substantially and has a positive effect on the sample quality. On the practical side, a very simple training procedure, that does not require additional hyperparameter tuning, is proposed and assessed on several datasets. 
", + "title": "The Bures Metric for Taming Mode Collapse in Generative Adversarial Networks", + "authors": [ + "Hannes De Meulemeester", + "Joachim Schreurs", + "Micha\u00ebl Fanuel", + "Bart De Moor", + "Johan Suykens" + ], + "emails": [ + "kuleuven.be", + "~Hannes_De_Meulemeester1", + "~Johan_Suykens1", + "esat.kuleuven.be" + ], + "rank": 1837 + }, + { + "url": "https://openreview.net/forum?id=CrWzAigsUEu", + "ratings": [ + 4, + 7, + 5, + 4 + ], + "rating": "5.00", + "confidences": [ + 5, + 5, + 4, + 5 + ], + "abstract": "We introduce factorize-sum-split-product networks (FSPNs), a new class of probabilistic graphical models (PGMs). FSPNs are designed to overcome the drawbacks of existing PGMs in terms of estimation accuracy and inference efficiency. Specifically, Bayesian networks (BNs) have low inference speed and performance of tree-structured sum-product networks(SPNs) significantly degrades in presence of highly correlated variables. FSPNs absorb their advantages by adaptively modeling the joint distribution of variables according to their dependence degree, so that one can simultaneously attain the two desirable goals\u2014high estimation accuracy and fast inference speed. We present efficient probability inference and structure learning algorithms for FSPNs, along with a theoretical analysis and extensive evaluation evidence. 
Our experimental results on synthetic and benchmark datasets indicate the superiority of FSPN over other PGMs.", + "title": "FSPN: A New Class of Probabilistic Graphical Model", + "authors": [ + "Ziniu Wu", + "Rong Zhu", + "Andreas Pfadler", + "Yuxing Han", + "Jiangneng Li", + "Zhengping Qian", + "Kai Zeng", + "Jingren Zhou" + ], + "emails": [ + "~Andreas_Pfadler1", + "~Jingren_Zhou1", + "~Jiangneng_Li1", + "~Rong_Zhu2", + "alibaba-inc.com", + "~Yuxing_Han3", + "~Ziniu_Wu1" + ], + "rank": 1838 + }, + { + "url": "https://openreview.net/forum?id=Z4YatHL7aq", + "decision": "Reject", + "ratings": [ + 4, + 6, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 5, + 5, + 5, + 4 + ], + "abstract": "We propose the Semantically-Adaptive UpSampling (SA-UpSample), a general and highly effective upsampling method for the layout-to-image translation task. SA-UpSample has three advantages: 1) Global view. Unlike traditional upsampling methods (e.g., Nearest-neighbor) that only exploit local neighborhoods, SA-UpSample can aggregate semantic information in a global view. \n2) Semantically adaptive. Instead of using a fixed kernel for all locations (e.g., Deconvolution), SA-UpSample enables semantic class-specific upsampling via generating adaptive kernels for different locations. 3) Efficient. Unlike Spatial Attention which uses a fully-connected strategy to connect all the pixels, SA-UpSample only considers the most relevant pixels, introducing little computational overhead. We observe that SA-UpSample achieves consistent and substantial gains on six popular datasets. 
The source code will be made publicly available.", + "title": "Semantically-Adaptive Upsampling for Layout-to-Image Translation", + "authors": [ + "Hao Tang", + "Nicu Sebe" + ], + "emails": [ + "~Nicu_Sebe1", + "~Hao_Tang6" + ], + "rank": 1839 + }, + { + "url": "https://openreview.net/forum?id=4_57x7xhymn", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 5, + 4, + 5 + ], + "abstract": "Recent works in self-supervised video prediction have mainly focused on passive forecasting and low-level action-conditional prediction, which sidesteps the problem of semantic learning. We introduce the task of semantic action-conditional video prediction, which can be regarded as an inverse problem of action recognition. The challenge of this new task primarily lies in how to effectively inform the model of semantic action information. To bridge vision and language, we utilize the idea of capsule and propose a novel video prediction model Action Concept Grounding Network (AGCN). Our method is evaluated on two newly designed synthetic datasets, CLEVR-Building-Blocks and Sapien-Kitchen, and experiments show that given different action labels, our ACGN can correctly condition on instructions and generate corresponding future frames without need of bounding boxes. We further demonstrate our trained model can make out-of-distribution predictions for concurrent actions, be quickly adapted to new object categories and exploit its learnt features for object detection. 
Additional visualizations can be found at https://iclr-acgn.github.io/ACGN/.", + "title": "Action Concept Grounding Network for Semantically-Consistent Video Generation", + "authors": [ + "Wei Yu", + "Wenxin Chen", + "Animesh Garg" + ], + "emails": [ + "~Wenxin_Chen1", + "~Animesh_Garg1", + "~Wei_Yu10" + ], + "rank": 1840 + }, + { + "url": "https://openreview.net/forum?id=1TIrbngpW0x", + "decision": "Reject", + "ratings": [ + 4, + 7, + 5, + 4 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "An important development in deep learning from the earliest MLPs has been a move towards architectures with structural inductive biases which enable the model to keep distinct sources of information and routes of processing well-separated. This structure is linked to the notion of independent mechanisms from the causality literature, in which a mechanism is able to retain the same processing as irrelevant aspects of the world are changed. For example, convnets enable separation over positions, while attention-based architectures (especially Transformers) learn which combination of positions to process dynamically. In this work we explore a way in which the Transformer architecture is deficient: it represents each position with a large monolithic hidden representation and a single set of parameters which are applied over the entire hidden representation. This potentially throws unrelated sources of information together, and limits the Transformer's ability to capture independent mechanisms. To address this, we propose Transformers with Independent Mechanisms (TIM), a new Transformer layer which divides the hidden representation and parameters into multiple mechanisms, which only exchange information through attention. Additionally, we propose a competition mechanism which encourages these mechanisms to specialize over time steps, and thus be more independent. 
We study TIM on a large scale BERT model, on the Image Transformer, and on speech enhancement and find evidence for semantically meaningful specialization as well as improved performance. ", + "title": "Transformers with Competitive Ensembles of Independent Mechanisms", + "authors": [ + "Alex Lamb", + "Di He", + "Anirudh Goyal", + "Guolin Ke", + "Chien-Feng Liao", + "Mirco Ravanelli", + "Yoshua Bengio" + ], + "emails": [ + "~Yoshua_Bengio1", + "~Chien-Feng_Liao1", + "~Guolin_Ke3", + "~Mirco_Ravanelli1", + "~Anirudh_Goyal1", + "~Di_He1", + "~Alex_Lamb1" + ], + "rank": 1841 + }, + { + "url": "https://openreview.net/forum?id=RpprvYz0xTM", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 3, + 4, + 4, + 4, + 3 + ], + "abstract": "This paper studies the problem of novel category discovery on single- and multi-modal data with labels from different but relevant categories. We present a generic, end-to-end framework to jointly learn a reliable representation and assign clusters to unlabelled data. To avoid over-fitting the learnt embedding to labelled data, we take inspiration from self-supervised representation learning by noise-contrastive estimation and extend it to jointly handle labelled and unlabelled data. In particular, we proposed using category discrimination on labelled data and cross-modal discrimination on multi-modal data to augment instance discrimination used in conventional contrastive learning approaches. We further introduce Winner-Take-All (WTA) hashing algorithm on the shared representation space to generate pairwise pseudo labels for unlabelled data to better predict cluster assignments. 
We thoroughly evaluate our framework on large-scale multi-modal video benchmarks Kinetics-400 and VGG-Sound, and image benchmarks CIFAR10, CIFAR100 and ImageNet, obtaining state-of-the-art results.", + "title": "A Flexible Framework for Discovering Novel Categories with Contrastive Learning", + "authors": [ + "Xuhui Jia", + "Kai Han", + "Yukun Zhu", + "Bradley Green" + ], + "emails": [ + "~Kai_Han1", + "~Xuhui_Jia1", + "~Yukun_Zhu1", + "~Bradley_Green3" + ], + "rank": 1842 + }, + { + "url": "https://openreview.net/forum?id=jwgZh4Y4U7", + "decision": "Reject", + "ratings": [ + 6, + 3, + 6 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "We aim to learn generalizable representations for complex activities by quantifying over both entities and time, as in \u201cthe kicker is behind all the other players,\u201d or \u201cthe player controls the ball until it moves toward the goal.\u201d Such a structural inductive bias of object relations, object quantification, and temporal orders will enable the learned representation to generalize to situations with varying numbers of agents, objects, and time courses. In this paper, we present Temporal and Object Quantification Nets (TOQ-Nets), which provide such structural inductive bias for learning composable action concepts from time sequences that describe the properties and relations of multiple entities. We evaluate TOQ-Nets on two benchmarks: trajectory-based soccer event detection, and 6D pose-based manipulation concept learning. We demonstrate that TOQ-Nets can generalize from small amounts of data to scenarios where there are more agents and objects than were present during training. The learned concepts are also robust with respect to temporally warped sequences and easily transfer to other prediction tasks in a similar domain.", + "title": "Temporal and Object Quantification Nets", + "authors": [ + "Jiayuan Mao", + "Zhezheng Luo", + "Chuang Gan", + "Joshua B. 
Tenenbaum", + "Jiajun Wu", + "Leslie Pack Kaelbling", + "Tomer Ullman" + ], + "emails": [ + "~Zhezheng_Luo1", + "~Jiajun_Wu1", + "~Jiayuan_Mao1", + "~Leslie_Pack_Kaelbling1", + "~Joshua_B._Tenenbaum1", + "~Chuang_Gan1", + "~Tomer_Ullman1" + ], + "rank": 1843 + }, + { + "url": "https://openreview.net/forum?id=QM4_h99pjCE", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 4, + 5 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 3, + 3, + 5 + ], + "abstract": "Recent work in multi-agent reinforcement learning (MARL) by [Zhang, ICML12018] provided the first decentralized actor-critic algorithm to offer convergence guarantees. In that work, policies are stochastic and are defined on finite action spaces. We extend those results to develop a provably-convergent decentralized actor-critic algorithm for learning deterministic policies on continuous action spaces. Deterministic policies are important in many real-world settings. To handle the lack of exploration inherent in deterministic policies we provide results for the off-policy setting as well as the on-policy setting. We provide the main ingredients needed for this problem: the expression of a local deterministic policy gradient, a decentralized deterministic actor-critic algorithm, and convergence guarantees when the value functions are approximated linearly. 
This work enables decentralized MARL in high-dimensional action spaces and paves the way for more widespread application of MARL.", + "title": "Decentralized Deterministic Multi-Agent Reinforcement Learning", + "authors": [ + "Antoine Grosnit", + "Desmond Cai", + "Laura Wynter" + ], + "emails": [ + "~Laura_Wynter1", + "polytechnique.edu", + "gmail.com" + ], + "rank": 1844 + }, + { + "url": "https://openreview.net/forum?id=YZ-NHPj6c6O", + "decision": "Reject", + "ratings": [ + 6, + 5, + 4, + 5 + ], + "rating": "5.00", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "Learning low-dimensional representations that disentangle the underlying factors of variation in data has been posited as an important step towards interpretable machine learning with good generalization. To address the fact that there is no consensus on what disentanglement entails, Higgins et al. (2018) propose a formal definition for Linear Symmetry-Based Disentanglement, or LSBD, arguing that underlying real-world transformations give exploitable structure to data.\n\nAlthough several works focus on learning LSBD representations, such methods require supervision on the underlying transformations for the entire dataset, and cannot deal with unlabeled data. Moreover, none of these works provide a metric to quantify LSBD.\n\nWe propose a metric to quantify LSBD representations that is easy to compute under certain well-defined assumptions. Furthermore, we present a method that can leverage unlabeled data, such that LSBD representations can be learned with limited supervision on transformations. Using our LSBD metric, our results show that limited supervision is indeed sufficient to learn LSBD representations.", + "title": "Quantifying and Learning Disentangled Representations with Limited Supervision", + "authors": [ + "Loek Tonnaer", + "Luis Armando P\u00e9rez Rey", + "Vlado Menkovski", + "Mike Holenderski", + "Jacobus W. 
Portegies" + ], + "emails": [ + "~Mike_Holenderski1", + "~Loek_Tonnaer1", + "tue.nl", + "~Vlado_Menkovski2", + "~Luis_Armando_P\u00e9rez_Rey1" + ], + "rank": 1845 + }, + { + "url": "https://openreview.net/forum?id=8FRw857AYba", + "decision": "Reject", + "ratings": [ + 6, + 3, + 6, + 6 + ], + "rating": "5.00", + "confidences": [ + 3, + 5, + 4, + 3 + ], + "abstract": "We propose a novel Deep Neuroevolution algorithm, QD-RL, that combines the strengths of off-policy reinforcement learning (RL) algorithms and Quality Diversity (QD) approaches to solve continuous control problems with neural controllers. The QD part contributes structural biases by decoupling the search for diversity from the search for high return, resulting in efficient management of the exploration-exploitation trade-off. The RL part contributes sample efficiency by relying on off-policy gradient-based updates of the agents. More precisely, we train a population of off-policy deep RL agents to simultaneously maximize diversity within the population and the return of each individual agent. QD-RL selects agents interchangeably from a Pareto front or from a Map-Elites grid, resulting in stable and efficient population updates. 
Our experiments in the Ant-Maze and Ant-Trap environments show that QD-RL can solve challenging exploration and control problems with deceptive rewards while being two orders of magnitude more sample efficient than the evolutionary counterpart.", + "title": "Sample efficient Quality Diversity for neural continuous control", + "authors": [ + "Thomas PIERROT", + "Valentin Mac\u00e9", + "Geoffrey Cideron", + "Nicolas Perrin", + "Karim Beguir", + "Olivier Sigaud" + ], + "emails": [ + "~Nicolas_Perrin1", + "~Geoffrey_Cideron1", + "instadeep.com", + "~Thomas_PIERROT1", + "~Olivier_Sigaud1", + "~Valentin_Mac\u00e91" + ], + "rank": 1846 + }, + { + "url": "https://openreview.net/forum?id=OSynkDOWbk2", + "decision": "Reject", + "ratings": [ + 4, + 6, + 4, + 6 + ], + "rating": "5.00", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "In this paper, we investigate the performance of several discretization algorithms for two first-order finite-time optimization flows. These flows are, namely, the rescaled-gradient flow (RGF) and the signed-gradient flow (SGF), and consist of non-Lipscthiz or discontinuous dynamical systems that converge locally in finite time to the minima of gradient-dominated functions. We introduce three discretization methods for these first-order finite-time flows, and provide convergence guarantees. We then apply the proposed algorithms in training neural networks and empirically test their performances on three standard datasets, namely, CIFAR10, SVHN, and MNIST. 
Our results show that our schemes demonstrate faster convergences against standard optimization alternatives, while achieving equivalent or better accuracy.", + "title": "First-Order Optimization Algorithms via Discretization of Finite-Time Convergent Flows", + "authors": [ + "Mouhacine Benosman", + "Orlando Romero", + "Anoop Cherian" + ], + "emails": [ + "~Anoop_Cherian1", + "gmail.com", + "~Mouhacine_Benosman1" + ], + "rank": 1847 + }, + { + "url": "https://openreview.net/forum?id=YQVjbJPnPc9", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 2 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Transformer is a ubiquitous model for natural language processing and has also attracted wide attentions in other domains such as computer vision. The self-attention maps, learned independently for each layer, are indispensable for a transformer model to encode the dependencies among input tokens, however, learning them effectively is still a challenging problem. In this paper, we address this problem and propose a novel approach to improve self-attention through supplementary prediction modules. The underlying assumption is that the attention structures in the current layer should not be completely independent from those in the previous layer. Instead, we model their dependencies via a chain of prediction models that take previous attention maps as input to predict the attention maps of a new layer through convolutional neural networks. Specifically, we propose Predictive Attention Transformer and obtain significant performance gains for various kinds of tasks on top of multiple state-of-the-art models. On GLUE benchmark, the average performances of BERT-Base and BERT-Large are lifted by 4.1 and 2.5 points respectively. For machine translation, it improves the BLUE score of a vanilla Transformer consistently on IWSLT'14 De-En dataset with different model sizes. 
For ImageNet classification, we achieve significant improvement over a strong backbone model with comparable capacity.", + "title": "Predictive Attention Transformer: Improving Transformer with Attention Map Prediction", + "authors": [ + "Yujing Wang", + "Yaming Yang", + "Jiangang Bai", + "Mingliang Zhang", + "Jing Bai", + "Jing Yu", + "Ce Zhang", + "Yunhai Tong" + ], + "emails": [ + "pku.edu.cn", + "microsoft.com", + "~Yaming_Yang1", + "~Ce_Zhang1", + "iie.ac.cn", + "~Yujing_Wang1", + "~Yunhai_Tong1" + ], + "rank": 1848 + }, + { + "url": "https://openreview.net/forum?id=iqmOTi9J7E8", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 4, + 5, + 3 + ], + "abstract": "Splitting network computations between the edge device and the cloud server is a promising approach for enabling low edge-compute and private inference of neural networks. Current methods for providing the privacy train the model to minimize information leakage for a given set of private attributes. In practice, however, the test queries might contain private attributes that are not foreseen during training. \nWe propose an alternative solution, in which, instead of obfuscating the information corresponding to a set of attributes, the edge device discards the information irrelevant to the main task. To this end, the edge device runs the model up to a split layer determined based on its computational capacity and then removes the activation content that is in the null space of the next layer of the model before sending it to the server. It can further remove the low-energy components of the remaining signal to improve the privacy at the cost of reducing the accuracy. The experimental results show that our methods provide privacy while maintaining the accuracy and introducing only a small computational overhead. 
", + "title": "Private Split Inference of Deep Networks", + "authors": [ + "Mohammad Samragh", + "Hossein Hosseini", + "Kambiz Azarian", + "Joseph Soriaga" + ], + "emails": [ + "~Hossein_Hosseini4", + "qti.qualcomm.com", + "~Mohammad_Samragh1" + ], + "rank": 1849 + }, + { + "url": "https://openreview.net/forum?id=aTozt4uZ5f", + "ratings": [ + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 3, + 5, + 4 + ], + "abstract": "In many situations, the data one has access to at test time follows a different distribution from the training data. Over the years, this problem has been tackled by closed-set domain adaptation techniques. Recently, open-set domain adaptation has emerged to address the more realistic scenario where additional unknown classes are present in the target data. In this setting, existing techniques focus on the challenging task of isolating the unknown target samples, so as to avoid the negative transfer resulting from aligning the source feature distributions with the broader target one that encompasses the additional unknown classes. Here, we propose a simpler and more effective solution consisting of complementing the source data distribution and making it comparable to the target one by enabling the model to generate source samples corresponding to the unknown target classes. To this end, we attach a generative model to a standard domain adaptation network and augment the source data with the generated samples before matching the source distribution to the target one, thus avoiding negative transfer between the domains. 
We formulate this as a general module that can be incorporated into any existing closed-set approach and show that this strategy allows us to outperform the state of the art on standard open-set domain adaptation benchmark datasets.", + "title": "Learning to Generate the Unknowns for Open-set Domain Adaptation", + "authors": [ + "Mahsa Baktashmotlagh", + "Tianle Chen", + "Mathieu Salzmann" + ], + "emails": [ + "~Mahsa_Baktashmotlagh1", + "~Mathieu_Salzmann1", + "uq.edu.au" + ], + "rank": 1850 + }, + { + "url": "https://openreview.net/forum?id=dhQHk8ShEmF", + "decision": "Reject", + "ratings": [ + 7, + 7, + 4, + 3 + ], + "rating": "5.00", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "Detecting out-of-distribution (OOD) inputs is critical for safely deploying deep learning models in an open-world setting. However, existing OOD detection solutions can be brittle in the open world, facing various types of adversarial OOD inputs. While methods leveraging auxiliary OOD data have emerged, our analysis reveals a key insight that the majority of auxiliary OOD examples may not meaningfully improve the decision boundary of the OOD detector. In this paper, we provide a theoretically motivated method, Adversarial Training with informative Outlier Mining (ATOM), which improves the robustness of OOD detection. We show that, by mining informative auxiliary OOD data, one can significantly improve OOD detection performance, and somewhat surprisingly, generalize to unseen adversarial attacks. ATOM achieves state-of-the-art performance under a broad family of classic and adversarial OOD evaluation tasks. For example, on the CIFAR-10 in-distribution dataset, ATOM reduces the FPR95 by up to 57.99\\% under adversarial OOD inputs, surpassing the previous best baseline by a large margin. 
", + "title": "Informative Outlier Matters: Robustifying Out-of-distribution Detection Using Outlier Mining", + "authors": [ + "Jiefeng Chen", + "Yixuan Li", + "Xi Wu", + "Yingyu Liang", + "Somesh Jha" + ], + "emails": [ + "~Jiefeng_Chen2", + "~Xi_Wu1", + "~Yixuan_Li1", + "~Yingyu_Liang1", + "~Somesh_Jha1" + ], + "rank": 1851 + }, + { + "url": "https://openreview.net/forum?id=1MJPtHogkwX", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 5 + ], + "abstract": "Healthcare represents one of the most promising application areas for machine learning algorithms, including modern methods based on deep learning. Modern deep learning algorithms perform best on large datasets and on unstructured modalities such as text or image data; advances in deep learning have often been driven by the availability of such large datasets. Here, we introduce Multi-Modal Multitask MIMIC-III (M3) \u2014 a dataset and benchmark for evaluating machine learning algorithms in the healthcare domain. This dataset contains multi-modal patient data collected from intensive care units \u2014 including physiological time series, clinical notes, ECG waveforms, and tabular inputs \u2014 and defines six clinical tasks \u2014 including predicting mortality, decompensation, readmission, and other outcomes \u2014 which serve as benchmarks for comparing algorithms. We introduce new multi-modal and multitask models for this dataset, and show that they outperform previous state-of-the-art results that only rely on a subset of all tasks and modalities. This highlights the potential of multitask and multi-modal learning to improve the performance of algorithms in the healthcare domain. 
More generally, we envision M3 as a general resource that will help accelerate research in applying machine learning to healthcare.", + "title": "A Multi-Modal and Multitask Benchmark in the Clinical Domain", + "authors": [ + "Yong Huang", + "Edgar Mariano Marroquin", + "Volodymyr Kuleshov" + ], + "emails": [ + "~Yong_Huang3", + "cornell.edu", + "~Edgar_Mariano_Marroquin1" + ], + "rank": 1852 + }, + { + "url": "https://openreview.net/forum?id=bGPNpnZYr1", + "decision": "Reject", + "ratings": [ + 4, + 7, + 4, + 5 + ], + "rating": "5.00", + "confidences": [ + 2, + 3, + 4, + 4 + ], + "abstract": "Active learning strategy to query unlabeled samples nearer the estimated decision boundary at each step has been known to be effective when the distance from the sample data to the decision boundary can be explicitly evaluated; however, in numerous cases in machine learning, especially when it involves deep learning, conventional distance such as the \u2113p from sample to decision boundary is not readily measurable. This paper defines a theoretical distance of unlabeled sample to the decision boundary as the least probable disagreement region (LPDR) containing the unlabeled sample, and it discusses how this theoretical distance can be empirically evaluated with a lower order of time complexity. Monte Carlo sampling of the hypothesis is performed in approximating the theoretically defined distance. Experimental results on various datasets show that the proposed algorithm consistently outperforms all other high performing uncertainty based active learning algorithms and leads to state-of-the-art active learning performance on CIFAR10, CIFAR100, Tiny ImageNet and Food101 datasets. Only the proposed algorithm outperforms random sampling on CIFAR100 dataset using K-CNN while all other algorithms fail to do so.", + "title": "Least Probable Disagreement Region for Active Learning", + "authors": [ + "Seong Jin Cho", + "Gwangsu Kim", + "Chang D. 
Yoo" + ], + "emails": [ + "~Seong_Jin_Cho1", + "~Gwangsu_Kim1", + "~Chang_D._Yoo1" + ], + "rank": 1853 + }, + { + "url": "https://openreview.net/forum?id=LtS9mII3jFi", + "ratings": [ + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "Complex-valued measurements in MRI and SAR imaging often have complex-valued scaling ambiguity, calling for models that are invariant to complex-valued scaling of pixels. Deep Complex Networks (DCN) extends real-valued algebra to complex-valued algebra in neural networks, but it does not address the issue of complex-valued scaling. SurReal complex-valued networks adopt a manifold view of complex numbers and derive a distance metric that is invariant to complex scaling. With distance features, it achieves complex-scaling invariance. However, rich complex-valued information is lost in this representation, and additionally, SurReal is also prevented from using complex-valued non-linearity, limiting its expressive power. We simplify the manifold formulation of SurReal and propose a new layer function that achieves complex-scaling invariance within the complex domain. We can then build hierarchical complex-valued features with complex-scaling invariance. Our so-called HyperReal model results in a much leaner model with better generalization. 
Benchmarked on MSTAR, HyperReal beats DCN (and matches SurReal) with only 3%(40%) of their respective parameter counts.", + "title": "HyperReal: Complex-Valued Layer Functions For Complex-Valued Scaling Invariance", + "authors": [ + "Utkarsh Singhal", + "Yifei Xing", + "Stella Yu" + ], + "emails": [ + "berkley.edu", + "~Stella_Yu2", + "~Utkarsh_Singhal1" + ], + "rank": 1854 + }, + { + "url": "https://openreview.net/forum?id=sgJJjd3-Y3", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 6 + ], + "rating": "5.00", + "confidences": [ + 2, + 4, + 3, + 3 + ], + "abstract": "Advances in scientific fields including drug discovery or material design are accompanied by numerous trials and errors. However, generally only representative experimental results are reported. Because of this reporting bias, the distribution of labeled result data can deviate from their true distribution. A regression model can be erroneous if it is built on these skewed data. In this work, we propose a new approach to improve the accuracy of regression models that are trained using a skewed dataset. The method forces the regression outputs to follow the true distribution; the forcing algorithm regularizes the regression results while keeping the information of the training data. We assume the existence of enough unlabeled data that follow the true distribution, and that the true distribution can be roughly estimated from domain knowledge or a few samples. During training neural networks to generate a regression model, an adversarial network is used to force the distribution of predicted values to follow the estimated \u2018true\u2019 distribution. We evaluated the proposed approach on four real-world datasets (pLogP, Diamond, House, Elevators). 
In all four datasets, the proposed approach reduced the root mean squared error of the regression by around 55 percent to 75 percent compared to regression models without adjustment of the distribution.", + "title": "Semi-supervised regression with skewed data via adversarially forcing the distribution of predicted values", + "authors": [ + "Dae-Woong Jeong", + "Kiyoung Kim", + "Changyoung Park", + "Sehui Han", + "Woohyung Lim" + ], + "emails": [ + "lgsp.co.kr", + "~Woohyung_Lim1", + "~Dae-Woong_Jeong1" + ], + "rank": 1855 + }, + { + "url": "https://openreview.net/forum?id=HK_B2K0026", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 4 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Deep learning has shown great promise in arrhythmia classification in electrocar- diogram (ECG). Existing works, when classifying an ECG segment with multiple beats, do not identify the locations of the anomalies, which reduces clinical inter- pretability. On the other hand, segmenting abnormal beats by deep learning usu- ally requires annotation for a large number of regular and irregular beats, which can be laborious, sometimes even challenging, with strong inter-observer variabil- ity between experts. In this work, we propose a method capable of not only dif- ferentiating arrhythmia but also segmenting the associated abnormal beats in the ECG segment. The only annotation used in the training is the type of abnormal beats and no segmentation labels are needed. Imitating human\u2019s perception of an ECG signal, the framework consists of a segmenter and classifier. The segmenter outputs an attention map, which aims to highlight the abnormal sections in the ECG by element-wise modulation. Afterwards, the signals are sent to a classifier for arrhythmia differentiation. 
Though the training data is only labeled to super- vise the classifier, the segmenter and the classifier are trained in an end-to-end manner so that optimizing classification performance also adjusts how the abnor- mal beats are segmented. Validation of our method is conducted on two dataset. We observe that involving the unsupervised segmentation in fact boosts the clas- sification performance. Meanwhile, a grade study performed by experts suggests that the segmenter also achieves satisfactory quality in identifying abnormal beats, which significantly enhances the interpretability of the classification results.", + "title": "Attention Based Joint Learning for Supervised Electrocardiogram Arrhythmia Differentiation with Unsupervised Abnormal Beat Segmentation", + "authors": [ + "Xinrong Hu", + "long wen", + "shushui wang", + "Dongpo Liang", + "Jian Zhuang", + "Yiyu Shi" + ], + "emails": [ + "qq.com", + "~Jian_Zhuang1", + "~Xinrong_Hu1", + "163.com", + "~Yiyu_Shi1" + ], + "rank": 1856 + }, + { + "url": "https://openreview.net/forum?id=dGF96IxczpW", + "ratings": [ + 6, + 4, + 4, + 6 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Model ensemble can effectively improve the accuracy of neural machine translation, which is accompanied by the cost of large computation and memory requirements. Additionally, model ensemble cannot combine the strengths of translation models with different decoding strategies since their translation probabilities cannot be directly aggregated. In this paper, we introduce an ensemble learning framework based on knowledge distillation to aggregate the knowledge of multiple teacher models into a single student model. Under this framework, we introduce word-level ensemble learning and sequence-level ensemble learning for neural machine translation, where sequence-level ensemble learning is capable of aggregating translation models with different decoding strategies. 
Experimental results on multiple translation tasks show that, by combining the two ensemble learning methods, our approach achieves substantial improvements over the competitive baseline systems and establishes a new single-model state-of-the-art BLEU score of 31.13 in the WMT14 English-German translation task.\\footnote{We will release the source code and the created SEL training data for reproducibility.}", + "title": "Knowledge Distillation based Ensemble Learning for Neural Machine Translation", + "authors": [ + "Chenze Shao", + "Meng Sun", + "Yang Feng", + "Zhongjun He", + "hua wu", + "Haifeng Wang" + ], + "emails": [ + "baidu.com", + "~Yang_Feng4", + "~hua_wu1", + "~Chenze_Shao1" + ], + "rank": 1857 + }, + { + "url": "https://openreview.net/forum?id=5zErZzsW2U1", + "ratings": [ + 5, + 6, + 5, + 4 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Deep neural networks have achieved great success in computer vision, thanks to their ability in extracting category-relevant semantic features. On the contrary, irrelevant features (e.g., background and confusing parts) are usually considered to be harmful. In this paper, we bring a new perspective on the potential benefits brought by irrelevant features: they could act as references to help identify relevant ones. Therefore, (1) we formulate a novel Category Disentangled Context (CDC) and develop an adversarial deep network to encode it; (2) we investigate utilizing the CDC to improve image classification with the attention mechanism as a bridge. 
Extensive comparisons on four benchmarks with various backbone networks demonstrate that the CDC could bring remarkable improvements consistently, validating the usefulness of irrelevant features.", + "title": "Category Disentangled Context: Turning Category-irrelevant Features Into Treasures", + "authors": [ + "Keke Tang", + "Guodong Wei", + "Jie Zhu", + "Yuexin Ma", + "Runnan Chen", + "Zhaoquan Gu", + "Wenping Wang" + ], + "emails": [ + "~Keke_Tang2", + "~Jie_Zhu2", + "~Guodong_Wei1", + "~Zhaoquan_Gu2", + "~Yuexin_Ma2", + "~Wenping_Wang1", + "~Runnan_Chen1" + ], + "rank": 1858 + }, + { + "url": "https://openreview.net/forum?id=IqZpoAAt2oQ", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Few-shot-learning seeks to find models that are capable of fast-adaptation to novel tasks which are not encountered during training. Unlike typical few-shot learning algorithms, we propose a contrastive learning method which is not trained to solve a set of tasks, but rather attempts to find a good representation of the underlying data-generating processes (\\emph{functions}). This allows for finding representations which are useful for an entire series of tasks sharing the same function. In particular, our training scheme is driven by the self-supervision signal indicating whether two sets of samples stem from the same underlying function. 
Our experiments on a number of synthetic and real-world datasets show that the representations we obtain can outperform strong baselines in terms of downstream performance and noise robustness, even when these baselines are trained in an end-to-end manner.", + "title": "Function Contrastive Learning of Transferable Representations", + "authors": [ + "Muhammad Waleed Gondal", + "Shruti Joshi", + "Nasim Rahaman", + "Stefan Bauer", + "Manuel Wuthrich", + "Bernhard Sch\u00f6lkopf" + ], + "emails": [ + "~Manuel_Wuthrich1", + "~Muhammad_Waleed_Gondal1", + "~Bernhard_Sch\u00f6lkopf1", + "~Stefan_Bauer1", + "~Nasim_Rahaman1", + "tuebingen.mpg.de" + ], + "rank": 1859 + }, + { + "url": "https://openreview.net/forum?id=-NEXDKk8gZ", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 3, + 4, + 3, + 2 + ], + "abstract": "We explore denoising diffusion probabilistic models, a class of generative models which have recently been shown to produce excellent samples in the image and audio domains. While these models produce excellent samples, it has yet to be shown that they can achieve competitive log-likelihoods. We show that, with several small modifications, diffusion models can achieve competitive log-likelihoods in the image domain while maintaining high sample quality. Additionally, our models allow for sampling with an order of magnitude fewer diffusion steps with only a modest difference in sample quality. Finally, we explore how sample quality and log-likelihood scale with the number of diffusion steps and the amount of model capacity. 
We conclude that denoising diffusion probabilistic models are a promising class of generative models with excellent scaling properties and sample quality.", + "title": "Improved Denoising Diffusion Probabilistic Models", + "authors": [ + "Alexander Quinn Nichol", + "Prafulla Dhariwal" + ], + "emails": [ + "~Alexander_Quinn_Nichol1", + "~Prafulla_Dhariwal1" + ], + "rank": 1860 + }, + { + "url": "https://openreview.net/forum?id=bMzj6hXL2VJ", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "It is a long-standing question to discover causal relations among a set of variables in many empirical sciences. Recently, Reinforcement Learning (RL) has achieved promising results in causal discovery. However, searching the space of directed graphs directly and enforcing acyclicity by implicit penalties tend to be inefficient and restrict the method to the small problems. In this work, we alternatively consider searching an ordering by RL from the variable ordering space that is much smaller than that of directed graphs, which also helps avoid dealing with acyclicity. Specifically, we formulate the ordering search problem as a Markov decision process, and then use different reward designs to optimize the ordering generating model. A generated ordering is then processed using variable selection methods to obtain the final directed acyclic graph. In contrast to other causal discovery methods, our method can also utilize a pretrained model to accelerate training. 
We conduct experiments on both synthetic and real-world datasets, and show that the proposed method outperforms other baselines on important metrics even on large graph tasks.", + "title": "Ordering-Based Causal Discovery with Reinforcement Learning", + "authors": [ + "Xiaoqiang Wang", + "Yali Du", + "Shengyu Zhu", + "Liangjun Ke", + "Zhitang Chen", + "Jianye HAO", + "Jun Wang" + ], + "emails": [ + "~Xiaoqiang_Wang2", + "~Shengyu_Zhu1", + "~Jianye_HAO1", + "~Jun_Wang2", + "~Zhitang_Chen1", + "~Yali_Du1", + "xjtu.edu.cn" + ], + "rank": 1861 + }, + { + "url": "https://openreview.net/forum?id=7_MJnN-U9hm", + "ratings": [ + 5, + 5, + 3, + 5, + 7 + ], + "rating": "5.00", + "confidences": [ + 5, + 5, + 4, + 5, + 4 + ], + "abstract": "Sparse neural networks have generated substantial interest recently because they can be more efficient in learning and inference, without any significant drop in performance. The \"lottery ticket hypothesis\" has showed the existence of such sparse subnetworks at initialization. Given a fully-connected initialized architecture, our aim is to find such \"winning ticket\" networks, without any training data. We first show the advantages of forming input-output paths, over pruning individual connections, to avoid bottlenecks in gradient propagation. Then, we show that Paths with Higher Edge-Weights (PHEW) at initialization have higher loss gradient magnitude, resulting in more efficient training. Selecting such paths can be performed without any data. We empirically validate the effectiveness of the proposed approach against pruning-before-training methods on CIFAR10, CIFAR100 and Tiny-ImageNet for VGG-Net and ResNet. PHEW achieves significant improvements on the current state-of-the-art methods at 10%, 5% and 2% network density. 
We also evaluate the structural similarity relationship between PHEW networks and pruned networks constructed through Iterated Magnitude Pruning (IMP), concluding that the former belong in the family of winning tickets networks.", + "title": "PHEW: Paths with Higher Edge-Weights give ''winning tickets'' without training data", + "authors": [ + "Shreyas Malakarjun Patil", + "Constantine Dovrolis" + ], + "emails": [ + "~Shreyas_Malakarjun_Patil1", + "~Constantine_Dovrolis1" + ], + "rank": 1862 + }, + { + "url": "https://openreview.net/forum?id=xP37gkVKa_0", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 5, + 5 + ], + "rating": "5.00", + "confidences": [ + 1, + 3, + 3, + 4, + 4 + ], + "abstract": "Search is an important tool for computing effective policies in single- and multi-agent environments, and has been crucial for achieving superhuman performance in several benchmark fully and partially observable games. However, one major limitation of prior search approaches for partially observable environments is that the computational cost scales poorly with the amount of hidden information. In this paper we present Learned Belief Search (LBS), a computationally efficient search procedure for partially observable environments. Rather than maintaining an exact belief distribution, LBS uses an approximate auto-regressive counterfactual belief that is learned as a supervised task. In multi-agent settings, LBS uses a novel public-private model architecture for underlying policies in order to efficiently evaluate these policies during rollouts. 
In the benchmark domain of Hanabi, LBS obtains more than 60% of the benefit of exact search while reducing compute requirements by 35\u00d7, allowing it to scale to larger settings that were inaccessible to previous search methods.", + "title": "Learned Belief Search: Efficiently Improving Policies in Partially Observable Settings", + "authors": [ + "Hengyuan Hu", + "Adam Lerer", + "Noam Brown", + "Jakob Nicolaus Foerster" + ], + "emails": [ + "~Hengyuan_Hu2", + "~Noam_Brown2", + "~Jakob_Nicolaus_Foerster1", + "~Adam_Lerer1" + ], + "rank": 1863 + }, + { + "url": "https://openreview.net/forum?id=1z_Hg9oBCtY", + "decision": "Reject", + "ratings": [ + 5, + 7, + 3 + ], + "rating": "5.00", + "confidences": [ + 5, + 4, + 4 + ], + "abstract": "We look at the problem of how the simulation of a financial market should be configured so that it most accurately emulates the behavior of a real market. In particular, we address agent-based simulations of markets that are composed of many hundreds or thousands of trading agents. A solution to this problem is important because it provides a credible test bed for evaluating potential trading algorithms (e.g., execution strategies). Simple backtesting of such algorithms suffers from a critical weaknesses, chiefly that the overall market is not responsive to the candidate trading algorithm. Multi-agent simulations address this weakness by simulating {\\it market impact} via interaction between market participants. Calibration of such multi-agent simulators to ensure realism, however, is a challenge. In this paper, we present MAS-GAN -- a multi-agent simulator calibration method that allows to tune simulator parameters and to support more accurate evaluations of candidate trading algorithm. 
Our calibration focus is on high level parameters such as the relative proportions of the various types of agents that populate the simulation.\nMAS-GAN is a two-step approach: first, we train a discriminator that is able to distinguish between ``real'' and ``fake'' market data as a part of GAN with self-attention, and then utilize it within an optimization framework to refine simulation parameters. The paper concludes with quantitative examples of applying MAS-GAN to improve simulator realism.", + "title": "MAS-GAN: Adversarial Calibration of Multi-Agent Market Simulators.", + "authors": [ + "Victor Storchan", + "Svitlana Vyetrenko", + "Tucker Balch" + ], + "emails": [ + "~Svitlana_Vyetrenko1", + "~Tucker_Balch2", + "jpmchase.com" + ], + "rank": 1864 + }, + { + "url": "https://openreview.net/forum?id=34KAZ9HbJco", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 4, + 5 + ], + "rating": "4.95", + "confidences": [ + 3, + 4, + 4, + 4, + 4 + ], + "abstract": "One crucial challenge of real-world multilingual speech recognition is the long-tailed distribution problem, where some resource-rich languages like English have abundant training data, but a long tail of low-resource languages have varying amounts of limited training data. To overcome the long-tail problem, in this paper, we propose Adapt-and-Adjust (A2), a transformer-based multi-task learning framework for end-to-end multilingual speech recognition. The A2 framework overcomes the long-tail problem via three techniques: (1) exploiting a pretrained multilingual language model (mBERT) to improve the performance of low-resource languages; (2) proposing dual adapters consisting of both language-specific and language-agnostic adaptation with minimal additional parameters; and (3) overcoming the class imbalance, either by imposing class priors in the loss during training or adjusting the logits of the softmax output during inference. 
Extensive experiments on the CommonVoice corpus show that A2 significantly outperforms conventional approaches.", + "title": "Adapt-and-Adjust: Overcoming the Long-tail Problem of Multilingual Speech Recognition", + "authors": [ + "Genta Indra Winata", + "Guangsen Wang", + "Caiming Xiong", + "Steven Hoi" + ], + "emails": [ + "~Steven_Hoi2", + "~Caiming_Xiong1", + "~Genta_Indra_Winata1", + "~Guangsen_Wang2" + ], + "rank": 1865 + }, + { + "url": "https://openreview.net/forum?id=px0-N3_KjA", + "decision": "Reject", + "ratings": [ + 6, + 6, + 6, + 2 + ], + "rating": "4.95", + "confidences": [ + 4, + 5, + 5, + 5 + ], + "abstract": "The offline reinforcement learning (RL) problem, also known as batch RL, refers to the setting where a policy must be learned from a static dataset, without additional online data collection. This setting is compelling as it potentially allows RL methods to take advantage of large, pre-collected datasets, much like how the rise of large datasets has fueled results in supervised learning in recent years. However, existing online RL benchmarks are not tailored towards the offline setting, making progress in offline RL difficult to measure. In this work, we introduce benchmarks specifically designed for the offline setting, guided by key properties of datasets relevant to real-world applications of offline RL. Examples of such properties include: datasets generated via hand-designed controllers and human demonstrators, multi-objective datasets where an agent can perform different tasks in the same environment, and datasets consisting of a mixtures of policies. To facilitate research, we release our benchmark tasks and datasets with a comprehensive evaluation of existing algorithms and an evaluation protocol together with an open-source codebase. 
We hope that our benchmark will focus research effort on methods that drive improvements not just on simulated tasks, but ultimately on the kinds of real-world problems where offline RL will have the largest impact. ", + "title": "D4RL: Datasets for Deep Data-Driven Reinforcement Learning", + "authors": [ + "Justin Fu", + "Aviral Kumar", + "Ofir Nachum", + "George Tucker", + "Sergey Levine" + ], + "emails": [ + "~George_Tucker1", + "~Ofir_Nachum1", + "~Aviral_Kumar2", + "~Justin_Fu1", + "~Sergey_Levine1" + ], + "rank": 1866 + }, + { + "url": "https://openreview.net/forum?id=XZDeL25T12l", + "decision": "Reject", + "ratings": [ + 5, + 3, + 6, + 6 + ], + "rating": "4.94", + "confidences": [ + 4, + 5, + 5, + 4 + ], + "abstract": "Knowledge distillation (KD) is an effective technique to compress a large model (teacher) to a compact one (student) by knowledge transfer. The ideal case is that the teacher is compressed to the small student without any performance dropping. However, even for the state-of-the-art (SOTA) distillation approaches, there is still an obvious performance gap between the student and the teacher. The existing literature usually attributes this to model capacity differences between them. However, model capacity differences are unavoidable in model compression. In this work, we systematically study this question. By designing exploratory experiments, we find that model capacity differences are not necessarily the root reason, and the distillation data matters when the student capacity is greater than a threshold. In light of this, we propose to go beyond in-distribution distillation and accordingly develop KD+. 
KD+ is superior to the original KD as it outperforms KD and the other SOTA approaches substantially and is more compatible with the existing approaches to further improve their performances significantly.", + "title": "Can Students Outperform Teachers in Knowledge Distillation based Model Compression?", + "authors": [ + "Xiang Deng", + "Zhongfei Zhang" + ], + "emails": [ + "~Zhongfei_Zhang1", + "~Xiang_Deng1" + ], + "rank": 1867 + }, + { + "url": "https://openreview.net/forum?id=LvJ8hLSusrv", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4, + 5 + ], + "rating": "4.94", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "Hamiltonian Monte Carlo (HMC) is one of the most successful sampling methods in machine learning. However, its performance is significantly affected by the choice of hyperparameter values, which require careful tuning. Existing approaches for automating this task either optimise a proxy for mixing speed or consider the HMC chain as an implicit variational distribution and optimize a tractable lower bound that is too loose to be useful in practice. Instead, we propose to optimize an objective that quantifies directly the speed of convergence to the target distribution. Our objective can be easily optimized using stochastic gradient descent. We evaluate our proposed method and compare to baselines on a variety of problems including synthetic 2D distributions, the posteriors of variational autoencoders and the Boltzmann distribution for molecular configurations of a 22 atom molecule. 
We find our method is competitive with or improves upon alternative baselines on all problems we consider.", + "title": "Gradient-based tuning of Hamiltonian Monte Carlo hyperparameters", + "authors": [ + "Andrew Campbell", + "Wenlong Chen", + "Vincent Stimper", + "Jos\u00e9 Miguel Hern\u00e1ndez-Lobato", + "Yichuan Zhang" + ], + "emails": [ + "~Jos\u00e9_Miguel_Hern\u00e1ndez-Lobato1", + "cam.ac.uk", + "~Andrew_Campbell4", + "~Yichuan_Zhang1" + ], + "rank": 1868 + }, + { + "url": "https://openreview.net/forum?id=gkOYZpeGEK", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5, + 6 + ], + "rating": "4.94", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "We present a dimensionality reduction algorithm called Uniform Manifold Approximation with Two-phase Optimization (UMATO) which produces less biased global structures in the embedding results and is robust over diverse initialization methods than previous methods such as t-SNE and UMAP. We divide the optimization into two phases to alleviate the bias by establishing the global structure early using the representatives of the high-dimensional structures. The phases are 1) global optimization to obtain the overall skeleton of data and 2) local optimization to identify the regional characteristics of local areas. 
In our experiments with one synthetic and three real-world datasets, UMATO outperformed widely-used baseline algorithms, such as PCA, red, t-SNE, UMAP, topological autoencoders and Anchor t-SNE, in terms of quality metrics and 2D projection results.", + "title": "Uniform Manifold Approximation with Two-phase Optimization", + "authors": [ + "Hyung-Kwon Ko", + "Jaemin Jo", + "Yung-Kyun Noh", + "Jinwook Seo" + ], + "emails": [ + "~Hyung-Kwon_Ko1", + "~Yung-Kyun_Noh1", + "~Jinwook_Seo1", + "~Jaemin_Jo1" + ], + "rank": 1869 + }, + { + "url": "https://openreview.net/forum?id=lJgbDxGhJ4r", + "decision": "Reject", + "ratings": [ + 7, + 4, + 5, + 4 + ], + "rating": "4.94", + "confidences": [ + 4, + 5, + 5, + 4 + ], + "abstract": "Modern semi-supervised learning methods conventionally assume both labeled and unlabeled data have the same class distribution. However, unlabeled data may include out-of-class samples in practice; those that cannot have one-hot encoded labels from a closed-set of classes in label data, i.e., unlabeled data is an open-set. In this paper, we introduce OpenCoS, a method for handling this realistic semi-supervised learning scenario based on a recent framework of contrastive learning. One of our key findings is that out-of-class samples in the unlabeled dataset can be identified effectively via (unsupervised) contrastive learning. OpenCoS utilizes this information to overcome the failure modes in the existing state-of-the-art semi-supervised methods, e.g., ReMixMatch or FixMatch. In particular, we propose to assign soft-labels for out-of-class samples using the representation learned from contrastive learning. 
Our extensive experimental results show the effectiveness of OpenCoS, fixing the state-of-the-art semi-supervised methods to be suitable for diverse scenarios involving open-set unlabeled data.", + "title": "OpenCoS: Contrastive Semi-supervised Learning for Handling Open-set Unlabeled Data", + "authors": [ + "Jongjin Park", + "Sukmin Yun", + "Jongheon Jeong", + "Jinwoo Shin" + ], + "emails": [ + "~Jongheon_Jeong1", + "~Jongjin_Park1", + "~Sukmin_Yun1", + "~Jinwoo_Shin1" + ], + "rank": 1870 + }, + { + "url": "https://openreview.net/forum?id=Uf_WNt41tUA", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 4 + ], + "rating": "4.94", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "Dialogue summarization is challenging due to its multi-speaker standpoints, casual spoken language, and limited labeled data. In this paper, we propose CorDial, aiming to improve the abstractive dialogue summarization quality and at the same time enable granularity controllability. We propose 1) a coarse-to-fine generation strategy that generates a summary draft followed by a final summary in an autoregressive way. The summary draft, which provides weakly-supervised signals, is composed of pseudo-labeled interrogative pronoun categories and noisy key phrases extracted with a constituency parser. 2) A simple strategy to control the granularity of the final summary. CorDial can predict and control the number of summary sentences for a given dialogue by predicting and highlighting different text spans from the source text. Our model achieves state-of-the-art performance on the largest dialogue summarization corpus SAMSum. 
We conduct comprehensive error analysis and show competitive human evaluation results to annotated summaries.", + "title": "CorDial: Coarse-to-fine Abstractive Dialogue Summarization with Controllable Granularity", + "authors": [ + "Chien-Sheng Wu", + "Linqing Liu", + "Wenhao Liu", + "Pontus Stenetorp", + "Caiming Xiong" + ], + "emails": [ + "~Chien-Sheng_Wu1", + "cs.ucl.ac.uk", + "gmail.com", + "~Caiming_Xiong1", + "salesforce.com" + ], + "rank": 1871 + }, + { + "url": "https://openreview.net/forum?id=GThGi8P9Vz", + "ratings": [ + 5, + 6, + 5, + 4 + ], + "rating": "4.94", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "We study the training of regularized neural networks where the regularizer can be non-smooth and non-convex.", + "title": "A Unified Framework for Proximal Methods", + "authors": [ + "Jihun Yun", + "Aurelie Lozano", + "Eunho Yang" + ], + "emails": [ + "~Eunho_Yang1", + "~Jihun_Yun2", + "~Aurelie_Lozano1" + ], + "rank": 1872 + }, + { + "url": "https://openreview.net/forum?id=udaowxM8rz", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 4 + ], + "rating": "4.94", + "confidences": [ + 5, + 4, + 3, + 5 + ], + "abstract": "Neural Networks are sensitive to various corruptions that usually occur in real-world applications such as low-lighting conditions, blurs, noises, etc. To estimate the robustness of neural networks to these common corruptions, we generally use a group of modeled corruptions gathered into a benchmark. We argue that corruption benchmarks often have a poor coverage: being robust to them only implies being robust to a narrow range of corruptions. They are also often unbalanced: they give too much importance to some corruptions compared to others. In this paper, we propose to build corruption benchmarks with only non-overlapping corruptions, to improve their coverage and their balance. Two corruptions overlap when the robustnesses of neural networks to these corruptions are correlated. 
We propose the first metric to measure the overlapping between two corruptions. We provide an algorithm that uses this metric to build benchmarks of Non-Overlapping Corruptions. Using this algorithm, we build from ImageNet a new corruption benchmark called ImageNet-NOC. We show that ImageNet-NOC is balanced and covers several kinds of corruptions that are not covered by ImageNet-C.", + "title": "Increasing the Coverage and Balance of Robustness Benchmarks by Using Non-Overlapping Corruptions", + "authors": [ + "Alfred LAUGROS", + "Alice Caplier", + "Matthieu Ospici" + ], + "emails": [ + "~Matthieu_Ospici1", + "~Alfred_LAUGROS1", + "~Alice_Caplier2" + ], + "rank": 1873 + }, + { + "url": "https://openreview.net/forum?id=8SP2-AiWttb", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4, + 5 + ], + "rating": "4.94", + "confidences": [ + 5, + 4, + 5, + 2 + ], + "abstract": "Evaluating the robustness of a defense model is a challenging task in adversarial robustness research. Obfuscated gradients, a type of gradient masking, have previously been found to exist in many defense methods and cause a false signal of robustness. In this paper, we identify a more subtle situation called \\emph{Imbalanced Gradients} that can also cause overestimated adversarial robustness. The phenomenon of imbalanced gradients occurs when the gradient of one term of the margin loss dominates and pushes the attack towards to a suboptimal direction. To exploit imbalanced gradients, we formulate a \\emph{Margin Decomposition (MD)} attack that decomposes a margin loss into individual terms and then explores the attackability of these terms separately via a two-stage process. We examine 12 state-of-the-art defense models, and find that models exploiting label smoothing easily cause imbalanced gradients, and on which our MD attacks can decrease their PGD robustness (evaluated by PGD attack) by over 23\\%. For 6 out of the 12 defenses, our attack can reduce their PGD robustness by at least 9\\%. 
The results suggest that imbalanced gradients need to be carefully addressed for more reliable adversarial robustness.", + "title": "Imbalanced Gradients: A New Cause of Overestimated Adversarial Robustness", + "authors": [ + "Linxi Jiang", + "Xingjun Ma", + "Zejia Weng", + "James Bailey", + "Yu-Gang Jiang" + ], + "emails": [ + "~Yu-Gang_Jiang1", + "fudan.edu.cn", + "~James_Bailey1", + "~Xingjun_Ma1" + ], + "rank": 1874 + }, + { + "url": "https://openreview.net/forum?id=Pgq5GE_-ph", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5, + 5 + ], + "rating": "4.94", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "Deep learning has shown promise for accurately predicting high-dimensional video sequences. Existing video prediction models succeeded in generating sharp but often short video sequences. Toward improving long-term video prediction, we study hierarchical latent variable models with levels that process at different time scales. To gain insights into the representations of such models, we study the information stored at each level of the hierarchy via the KL divergence, predictive entropy, datasets of varying speed, and generative distributions. Our analysis confirms that faster changing details are generally captured by lower levels, while slower changing facts are remembered by higher levels. 
On synthetic datasets where common methods fail after 25 frames, we show that temporally abstract latent variable models can make accurate predictions for up to 200 frames.", + "title": "Video Prediction with Variational Temporal Hierarchies", + "authors": [ + "Vaibhav Saxena", + "Jimmy Ba", + "Danijar Hafner" + ], + "emails": [ + "~Vaibhav_Saxena1", + "~Danijar_Hafner1", + "~Jimmy_Ba1" + ], + "rank": 1875 + }, + { + "url": "https://openreview.net/forum?id=IneoHhrfv5", + "ratings": [ + 5, + 6, + 5, + 4 + ], + "rating": "4.94", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "We present a method to edit a target portrait footage by taking a sequence of audio as input to synthesize a photo-realistic video. This method is unique because it is highly dynamic. It does not assume a person-specific rendering network yet capable of translating one source audio into one random chosen video output within a set of speech videos. Instead of learning a highly heterogeneous and nonlinear mapping from audio to the video directly, we first factorize each target video frame into orthogonal parameter spaces, i.e., expression, geometry, and pose, via monocular 3D face reconstruction. Next, a recurrent network is introduced to translate source audio into expression parameters that are primarily related to the audio content. The audio-translated expression parameters are then used to synthesize a photo-realistic human subject in each video frame, with the movement of the mouth regions precisely mapped to the source audio. The geometry and pose parameters of the target human portrait are retained, therefore preserving the con-text of the original video footage. Finally, we introduce a novel video rendering network and a dynamic programming method to construct a temporally coherent and photo-realistic video. Extensive experiments demonstrate the superiority of our method over existing approaches. 
Our method is end-to-end learnable and robust to voice variations in the source audio.", + "title": "Everybody's Talkin': Let Me Talk as You Want", + "authors": [ + "Linsen Song", + "Wayne Wu", + "Chen Qian", + "Ran He", + "Chen Change Loy" + ], + "emails": [ + "~Wayne_Wu1", + "~Chen_Change_Loy2", + "~Linsen_Song1", + "~Ran_He1", + "~Chen_Qian1" + ], + "rank": 1876 + }, + { + "url": "https://openreview.net/forum?id=-HsAI7VKsz", + "ratings": [ + 6, + 4, + 6, + 4 + ], + "rating": "4.94", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": " Recently proposed one-stage instance segmentation models (\\emph{e.g.}, SOLO) learn to directly predict location-specific object mask with fully-convolutional networks. They perform comparably well as the traditional two-stage Mask R-CNN model, yet enjoying much simpler architecture and higher efficiency. However, an intrinsic limitation of these models is that they tend to generate similar mask predictions for a single object at nearby locations, while most of them are directly discarded by non-maximum suppression, leading to a waste of some useful predictions that can supplement the final result. In this work, we aim to explore how the model can benefit from better leveraging the neighboring predictions while maintaining the architectural simplicity and efficiency. To this end, we develop a novel learning-based aggregation framework that learns to aggregate the neighboring predictions. Meanwhile, unlike original location-based masks, the segmentation model is implicitly supervised to learn location-aware \\textit{mask representations} that encode the geometric structure of nearby objects and complements adjacent representations with context. Based on the aggregation framework, we further introduce a mask interpolation mechanism that enables sharing mask representations for nearby spatial locations, thus allowing the model to generate much fewer representations for computation and memory saving. 
We experimentally show that by simply augmenting the baseline model with our proposed aggregation framework, the instance segmentation performance is significantly improved. For instance, it improves a SOLO model with ResNet-101 backbone by 2.0 AP on the COCO benchmark, with only about 2\\% increase of computation. {Code and models} are available at anonymous repository: {\\url{https://github.com/advdfacd/AggMask}}.", + "title": "AggMask: Exploring locally aggregated learning of mask representations for instance segmentation", + "authors": [ + "Tao Wang", + "Jun Hao Liew", + "Yu Li", + "Yunpeng Chen", + "Jiashi Feng" + ], + "emails": [ + "~Jun_Hao_Liew1", + "~Tao_Wang3", + "~Jiashi_Feng1", + "~Yu_Li7", + "~Yunpeng_Chen1" + ], + "rank": 1877 + }, + { + "url": "https://openreview.net/forum?id=-qB7ZgRNRq", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 4 + ], + "rating": "4.93", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "In spoken question answering, QA systems are designed to answer questions from contiguous text spans within the related speech transcripts. However, the most natural way that human seek or test their knowledge is via human conversations. Therefore, we propose a new Spoken Conversational Question Answering task (SCQA), aiming at enabling QA systems to model complex dialogues flow given the speech utterances and text corpora. In this task, our main objective is to build a QA system to deal with conversational questions both in spoken and text forms, and to explore the plausibility of providing more cues in spoken documents with systems in information gathering. To this end, instead of adopting automatically generated speech transcripts with highly noisy data, we propose a novel unified data distillation approach, DDNet, which directly fuse audio-text features to reduce the misalignment between automatic speech recognition hypotheses and the reference transcriptions. 
In addition, to evaluate the capacity of QA systems in a dialogue-style interaction, we assemble a Spoken Conversational Question Answering (Spoken-CoQA) dataset with more than 120k question-answer pairs. Experiments demonstrate that our proposed method achieves superior performance in spoken conversational question answering.\n", + "title": "Towards Data Distillation for End-to-end Spoken Conversational Question Answering", + "authors": [ + "Chenyu You", + "Nuo Chen", + "Fenglin Liu", + "Dongchao Yang", + "Zhiyang Xu", + "Yuexian Zou" + ], + "emails": [ + "~Nuo_Chen1", + "~Yuexian_Zou1", + "~Chenyu_You1", + "~Dongchao_Yang1", + "~Zhiyang_Xu1", + "~Fenglin_Liu1" + ], + "rank": 1878 + }, + { + "url": "https://openreview.net/forum?id=Ux5zdAir9-U", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4, + 5 + ], + "rating": "4.93", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Relational inductive biases have a key role in building learning agents that can generalize and reason in a compositional manner. While relational learning algorithms such as graph neural networks (GNNs) show promise, we do not understand their effectiveness to adapt to new tasks. In this work, we study the logical generalization capabilities of GNNs by designing a benchmark suite grounded in first-order logic. Our benchmark suite, GraphLog, requires that learning algorithms perform rule induction in different synthetic logics, represented as knowledge graphs. \nGraphLog consists of relation prediction tasks on 57 distinct procedurally generated logical worlds. We use GraphLog to evaluate GNNs in three different setups: single-task supervised learning, multi-task (with pretraining), and continual learning. Unlike previous benchmarks, GraphLog enables us to precisely control the logical relationship between the different worlds by controlling the underlying first-order logic rules. 
We find that models' ability to generalize and adapt strongly correlates to the availability of diverse sets of logical rules during multi-task training. We also find the severe catastrophic forgetting effect in continual learning scenarios, and GraphLog provides a precise mechanism to control the distribution shift. Overall, our results highlight new challenges for the design of GNN models, opening up an exciting area of research in generalization using graph-structured data.", + "title": "GraphLog: A Benchmark for Measuring Logical Generalization in Graph Neural Networks", + "authors": [ + "Koustuv Sinha", + "Shagun Sodhani", + "Joelle Pineau", + "William L. Hamilton" + ], + "emails": [ + "~Joelle_Pineau1", + "~Koustuv_Sinha1", + "~William_L._Hamilton1", + "~Shagun_Sodhani1" + ], + "rank": 1879 + }, + { + "url": "https://openreview.net/forum?id=exa2mDqPb5E", + "decision": "Reject", + "ratings": [ + 4, + 7, + 5, + 4 + ], + "rating": "4.93", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "Images of general objects are often composed of three hidden factors: category (e.g., car or chair), shape (e.g., particular car form), and view (e.g., 3D orientation). While there have been many disentangling models that can discover either a category or shape factor separately from a view factor, such models typically cannot capture the structure of general objects that the diversity of shapes is much larger across categories than within a category. Here, we propose a novel generative model called CIGMO, which can learn to represent the category, shape, and view factors at once only with weak supervision. Concretely, we develop mixture of disentangling deep generative models, where the mixture components correspond to object categories and each component model represents shape and view in a category-specific and mutually invariant manner. 
We devise a learning method based on variational autoencoders that does not explicitly use label information but uses only grouping information that links together different views of the same object. Using several datasets of 3D objects including ShapeNet, we demonstrate that our model often outperforms previous relevant models including state-of-the-art methods in invariant clustering and one-shot classification tasks, in a manner exposing the importance of categorical invariant representation.\n", + "title": "CIGMO: Learning categorical invariant deep generative models from grouped data", + "authors": [ + "Haruo Hosoya" + ], + "emails": [ + "~Haruo_Hosoya1" + ], + "rank": 1880 + }, + { + "url": "https://openreview.net/forum?id=b_7OR0Fo_iN", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5, + 5 + ], + "rating": "4.93", + "confidences": [ + 4, + 5, + 2, + 3 + ], + "abstract": "Neighbor embeddings are a family of methods for visualizing complex high-dimensional datasets using kNN graphs. To find the low-dimensional embedding, these algorithms combine an attractive force between neighboring pairs of points with a repulsive force between all points. One of the most popular examples of such algorithms is t-SNE. Here we empirically show that changing the balance between the attractive and the repulsive forces in t-SNE yields a spectrum of embeddings, which is characterized by a simple trade-off: stronger attraction can better represent continuous manifold structures, while stronger repulsion can better represent discrete cluster structures. We find that UMAP embeddings correspond to t-SNE with increased attraction; mathematical analysis shows that this is because the negative sampling optimisation strategy employed by UMAP strongly lowers the effective repulsion. Likewise, ForceAtlas2, commonly used for visualizing developmental single-cell transcriptomic data, yields embeddings corresponding to t-SNE with the attraction increased even more. 
At the extreme of this spectrum lies Laplacian Eigenmaps, corresponding to zero repulsion. Our results demonstrate that many prominent neighbor embedding algorithms can be placed onto this attraction-repulsion spectrum, and highlight the inherent trade-offs between them. ", + "title": "A Unifying Perspective on Neighbor Embeddings along the Attraction-Repulsion Spectrum", + "authors": [ + "Niklas B\u00f6hm", + "Philipp Berens", + "Dmitry Kobak" + ], + "emails": [ + "~Philipp_Berens1", + "~Dmitry_Kobak2", + "~Niklas_B\u00f6hm1" + ], + "rank": 1881 + }, + { + "url": "https://openreview.net/forum?id=_zHHAZOLTVh", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 3 + ], + "rating": "4.93", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "In this paper, we propose a maximum mutual information (MMI) framework for multi-agent reinforcement learning (MARL) to enable multiple agents to learn coordinated behaviors by regularizing the accumulated return with the mutual information between actions. By introducing a latent variable to induce nonzero mutual information between actions and applying a variational bound, we derive a tractable lower bound on the considered MMI-regularized objective function. Applying policy iteration to maximize the derived lower bound, we propose a practical algorithm named variational maximum mutual information multi-agent actor-critic (VM3-AC), which follows centralized learning with decentralized execution (CTDE). 
We evaluated VM3-AC for several games requiring coordination, and numerical results show that VM3-AC outperforms MADDPG and other MARL algorithms in multi-agent tasks requiring coordination.", + "title": "A Maximum Mutual Information Framework for Multi-Agent Reinforcement Learning", + "authors": [ + "Woojun Kim", + "Whiyoung Jung", + "Myungsik Cho", + "Youngchul Sung" + ], + "emails": [ + "~Youngchul_Sung1", + "~Whiyoung_Jung1", + "~Woojun_Kim1", + "kaist.ac.kr" + ], + "rank": 1882 + }, + { + "url": "https://openreview.net/forum?id=szXGN2CLjwf", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 4 + ], + "rating": "4.93", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Adam is a widely used stochastic optimization method for deep learning applications. While practitioners prefer Adam because it requires less parameter tuning, its use is problematic from a theoretical point of view since it may not converge. Variants of Adam have been proposed with provable convergence guarantee, but they tend not be competitive with Adam on the practical performance. In this paper, we propose a new method named Adam+ (pronounced as Adam-plus). Adam+ retains some of the key components of Adam but it also has several noticeable differences: (i) it does not maintain the moving average of second moment estimate but instead computes the moving average of first moment estimate at extrapolated points; (ii) its adaptive step size is formed not by dividing the square root of second moment estimate but instead by dividing the root of the norm of first moment estimate. As a result, Adam+ requires few parameter tuning, as Adam, but it enjoys a provable convergence guarantee. Our analysis further shows that Adam+ enjoys adaptive variance reduction, i.e., the variance of the stochastic gradient estimator reduces as the algorithm converges, hence enjoying an adaptive convergence. 
We also propose a more general variant of Adam+ with different adaptive step sizes and establish their fast convergence rate. Our empirical studies on various deep learning tasks, including image classification, language modeling, and automatic speech recognition, demonstrate that Adam+ significantly outperforms Adam and achieves comparable performance with best-tuned SGD and momentum SGD.", + "title": "Adam+: A Stochastic Method with Adaptive Variance Reduction", + "authors": [ + "Mingrui Liu", + "Wei Zhang", + "Francesco Orabona", + "Tianbao Yang" + ], + "emails": [ + "~Tianbao_Yang1", + "~Mingrui_Liu2", + "~Francesco_Orabona1", + "~Wei_Zhang33" + ], + "rank": 1883 + }, + { + "url": "https://openreview.net/forum?id=HMEiDPTOTmY", + "decision": "Reject", + "ratings": [ + 6, + 4, + 4, + 6 + ], + "rating": "4.93", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Pre-trained contextualized language models (PrLMs) broadly use fine-grained tokens (words or sub-words) as minimal linguistic unit in pre-training phase. Introducing span-level information in pre-training has shown capable of further enhancing PrLMs. However, such methods require enormous resources and are lack of adaptivity due to huge computational requirement from pre-training. Instead of too early fixing the linguistic unit input as nearly all previous work did, we propose a novel method that combines span-level information into the representations generated by PrLMs during fine-tuning phase for better flexibility. In this way, the modeling procedure of span-level texts can be more adaptive to different downstream tasks. In detail, we divide the sentence into several spans according to the segmentation generated by a pre-sampled dictionary. Based on the sub-token-level representation provided by PrLMs, we enhance the connection between the tokens in each span and gain a representation with enhanced span-level information. 
Experiments are conducted on GLUE benchmark and prove that our approach could remarkably enhance the performance of PrLMs in various natural language understanding tasks.", + "title": "Later Span Adaptation for Language Understanding", + "authors": [ + "Rongzhou Bao", + "Zhuosheng Zhang", + "hai zhao" + ], + "emails": [ + "~Rongzhou_Bao1", + "~Zhuosheng_Zhang1", + "~hai_zhao1" + ], + "rank": 1884 + }, + { + "url": "https://openreview.net/forum?id=1OQ90khuUGZ", + "decision": "Reject", + "ratings": [ + 4, + 6, + 4, + 6 + ], + "rating": "4.93", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Training agents using Reinforcement Learning in games with sparse rewards is a challenging problem, since large amounts of exploration are required to retrieve even the first reward. To tackle this problem, a common approach is to use reward shaping to help exploration. However, an important drawback of reward shaping is that agents sometimes learn to optimize the shaped reward instead of the true objective. In this paper, we present a novel technique that we call action guidance that successfully trains agents to eventually optimize the true objective in games with sparse rewards while maintaining most of the sample efficiency that comes with reward shaping. We evaluate our approach in a simplified real-time strategy (RTS) game simulator called \u03bcRTS. ", + "title": "Action Guidance: Getting the Best of Sparse Rewards and Shaped Rewards for Real-time Strategy Games", + "authors": [ + "Shengyi Huang", + "Santiago Ontanon" + ], + "emails": [ + "~Shengyi_Huang1", + "~Santiago_Ontanon1" + ], + "rank": 1885 + }, + { + "url": "https://openreview.net/forum?id=2G9u-wu2tXP", + "decision": "Reject", + "ratings": [ + 4, + 6, + 4, + 6 + ], + "rating": "4.93", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Continual learning could shift the machine learning paradigm from data centric to model centric. 
A continual learning model needs to scale efficiently to handle semantically different datasets, while avoiding unnecessary growth. We introduce hash-routed convolutional neural networks: a group of convolutional units where data flows dynamically. Feature maps are compared using feature hashing and similar data is routed to the same units. A hash-routed network provides excellent plasticity thanks to its routed nature, while generating stable features through the use of orthogonal feature hashing. Each unit evolves separately and new units can be added (to be used only when necessary). Hash-routed networks achieve excellent performance across a variety of typical continual learning benchmarks without storing raw data and train using only gradient descent. Besides providing a continual learning framework for supervised tasks with encouraging results, our model can be used for unsupervised or reinforcement learning.", + "title": "Continual learning using hash-routed convolutional neural networks", + "authors": [ + "Ahmad Berjaoui" + ], + "emails": [ + "~Ahmad_Berjaoui1" + ], + "rank": 1886 + }, + { + "url": "https://openreview.net/forum?id=iEcqwosBEgx", + "decision": "Reject", + "ratings": [ + 4, + 6, + 4, + 6 + ], + "rating": "4.93", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "We address the problem of seeking novel policies in reinforcement learning tasks. Instead of following the multi-objective framework commonly used in existing methods, we propose to rethink the problem under a novel perspective of constrained optimization. We at first introduce a new metric to evaluate the difference between policies, and then design two practical novel policy seeking methods following the new perspective, namely the Constrained Task Novel Bisector (CTNB), and the Interior Policy Differentiation (IPD), corresponding to the feasible direction method and the interior point method commonly known in the constrained optimization literature. 
Experimental comparisons on the MuJuCo control suite show our methods can achieve substantial improvements over previous novelty-seeking methods in terms of both the novelty of policies and their performances in the primal task.", + "title": "Novel Policy Seeking with Constrained Optimization", + "authors": [ + "Hao Sun", + "Zhenghao Peng", + "Bo Dai", + "Jian Guo", + "Dahua Lin", + "Bolei Zhou" + ], + "emails": [ + "~Hao_Sun3", + "~Zhenghao_Peng1", + "~Bolei_Zhou5", + "pcl.ac.cn", + "~Dahua_Lin1", + "~Bo_Dai2" + ], + "rank": 1887 + }, + { + "url": "https://openreview.net/forum?id=Jf24xdaAwF9", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5, + 5 + ], + "rating": "4.93", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "The ability for an agent to continuously learn new skills without catastrophically forgetting existing knowledge is of critical importance for the development of generally intelligent agents. Most methods devised to address this problem depend heavily on well-defined task boundaries which simplify the problem considerably. Our task-agnostic method, Self-Activating Neural Ensembles (SANE), uses a hierarchical modular architecture designed to avoid catastrophic forgetting without making any such assumptions. At each timestep a path through the SANE tree is activated; during training only activated nodes are updated, ensuring that unused nodes do not undergo catastrophic forgetting. Additionally, new nodes are created as needed, allowing the system to leverage and retain old skills while growing and learning new ones. 
We demonstrate our approach on MNIST and a set of grid world environments, demonstrating that SANE does not undergo catastrophic forgetting where existing methods do.", + "title": "Self-Activating Neural Ensembles for Continual Reinforcement Learning", + "authors": [ + "Sam Powers", + "Abhinav Gupta" + ], + "emails": [ + "~Abhinav_Gupta1", + "~Sam_Powers1" + ], + "rank": 1888 + }, + { + "url": "https://openreview.net/forum?id=L-88RyVtXGr", + "decision": "Reject", + "ratings": [ + 4, + 6, + 5, + 5 + ], + "rating": "4.93", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "Recently, inspired by repetitive block structure of modern ConvNets, such as ResNets, parameter-sharing among repetitive convolution layers has been proposed to reduce the size of parameters. However, naive sharing of convolution filters poses many challenges such as overfitting and vanishing/exploding gradients, resulting in worse performance than non-shared counterpart models. Furthermore, sharing parameters often increases computational complexity due to additional operations for re-parameterization. In this work, we propose an efficient parameter-sharing structure and an effective training mechanism of deeply shared parameters. In the proposed ConvNet architecture, convolution layers are decomposed into a filter basis, that can be shared recursively, and layer-specific parts. We conjecture that a shared filter basis combined with a small amount of layer-specific parameters can retain, or further enhance, the representation power of individual layers, if a proper training method is applied. We show both theoretically and empirically that potential vanishing/exploding gradients problems can be mitigated by enforcing orthogonality to the shared filter bases. 
Experimental results demonstrate that our scheme effectively reduces redundancy by saving up to 63.8% of parameters while consistently outperforming non-shared counterpart networks even when a filter basis is deeply shared by up to 10 repetitive convolution layers.", + "title": "Learning Deeply Shared Filter Bases for Efficient ConvNets", + "authors": [ + "Woochul Kang", + "Daeyeon Kim" + ], + "emails": [ + "gmail.com", + "~Woochul_Kang1" + ], + "rank": 1889 + }, + { + "url": "https://openreview.net/forum?id=oQyb8NrFzu", + "decision": "Reject", + "ratings": [ + 4, + 4, + 7, + 5 + ], + "rating": "4.93", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "The technique of algorithmic stability has been used to capture the generalization power of several learning models, especially those trained with stochastic gradient descent (SGD). This paper investigates the tightness of the algorithmic stability bounds for SGD given by~\\cite{hardt2016train}. We show that the analysis of~\\cite{hardt2016train} is tight for convex objective functions, but loose for non-convex objective functions. In the non-convex case we provide a tighter upper bound on the stability (and hence generalization error), and provide evidence that it is asymptotically tight up to a constant factor. \nHowever, deep neural networks trained with SGD exhibit much better stability and generalization in practice than what is suggested by these (tight) bounds,\nnamely, linear or exponential degradation with time for SGD with constant step size. We aim towards characterizing deep learning loss functions with good generalization guarantees, despite being trained using SGD with constant step size.\n We propose the Hessian Contractive (HC) condition, which specifies the contractivity of regions containing local minima in the neural network loss landscape. 
We provide empirical evidence that this condition holds for several loss functions, and provide theoretical evidence that the known tight SGD stability bounds for convex and non-convex loss functions can be circumvented by HC loss functions, thus partially explaining the generalization of deep neural networks. ", + "title": "Revisiting the Stability of Stochastic Gradient Descent: A Tightness Analysis", + "authors": [ + "Yikai Zhang", + "Samuel Bald", + "wenjia Zhang", + "Vamsi Pritham Pingali", + "Chao Chen", + "Mayank Goswami" + ], + "emails": [ + "~Mayank_Goswami1", + "~Chao_Chen1", + "gmail.com", + "~Yikai_Zhang1", + "~Samuel_Bald1", + "rutgers.edu" + ], + "rank": 1890 + }, + { + "url": "https://openreview.net/forum?id=86t2GlfzFo", + "decision": "Reject", + "ratings": [ + 6, + 4, + 7, + 3 + ], + "rating": "4.93", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "The curvature of the loss, provides rich information on the geometry underlying neural networks, with applications in second order optimisation and Bayesian deep learning. However, accessing curvature information is still a daunting engineering challenge, inaccessible to most practitioners. We hence provide a software package the \\textbf{Deep Curvature Suite}, which allows easy curvature evaluation for large modern neural networks. Beyond the calculation of a highly accurate moment matched approximation of the Hessian spectrum using Lanczos, our package provides: extensive \\emph{loss surface visualisation}, the calculation of the \\emph{Hessian variance} and \\emph{stochastic second order optimisers}. We further address and disprove many common misconceptions in the literature about the Lanczos algorithm, namely that it learns eigenvalues from the top down. We prove using high dimensional concentration inequalities that for specific matrices a single random vector is sufficient for accurate spectral estimation, informing our spectral visualisation method. 
We showcase our package practical utility on a series of examples based on realistic modern neural networks such as the VGG-16 and Preactivated ResNets on the CIFAR-10/100 datasets. We further detail 3 specific potential use cases enabled by our software: research in stochastic second order optimisation for deep learning, learning rate scheduling using known optimality formulae for convex surfaces and empirical verification of deep learning theory based on comparing empirical and theoretically implied spectra.", + "title": "Deep Curvature Suite", + "authors": [ + "Diego Granziol", + "Xingchen Wan", + "Timur Garipov" + ], + "emails": [ + "~Timur_Garipov1", + "~Diego_Granziol1", + "~Xingchen_Wan1" + ], + "rank": 1891 + }, + { + "url": "https://openreview.net/forum?id=dFBRrTMjlyL", + "decision": "Reject", + "ratings": [ + 6, + 4, + 6, + 4 + ], + "rating": "4.93", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "The problem of exploding and vanishing gradients has been a long-standing obstacle that hinders the effective training of neural networks. Despite various tricks and techniques that have been employed to alleviate the problem in practice, there still lacks satisfactory theories or provable solutions. In this paper, we address the problem from the perspective of high-dimensional probability theory. We provide a rigorous result that shows, under mild conditions, how the exploding/vanishing gradient problem disappears with high probability if the neural networks have sufficient width. Our main idea is to constrain both forward and backward signal propagation in a nonlinear neural network through a new class of activation functions, namely Gaussian-Poincar\u00e9 normalized functions, and orthogonal weight matrices. 
Experiments on both synthetic and real-world data validate our theory and confirm its effectiveness on very deep neural networks when applied in practice.", + "title": "Bidirectionally Self-Normalizing Neural Networks", + "authors": [ + "Yao Lu", + "Stephen Gould", + "Thalaiyasingam Ajanthan" + ], + "emails": [ + "~Stephen_Gould1", + "~Thalaiyasingam_Ajanthan1", + "anu.edu.au" + ], + "rank": 1892 + }, + { + "url": "https://openreview.net/forum?id=4vDf4Qtodh", + "decision": "Reject", + "ratings": [ + 6, + 4, + 6, + 4 + ], + "rating": "4.93", + "confidences": [ + 3, + 5, + 4, + 3 + ], + "abstract": "In this paper, we introduce InstantEmbedding, an efficient method for generating single-node representations using local PageRank computations. We prove that our approach produces globally consistent representations in sublinear time. We demonstrate this empirically by conducting extensive experiments on real-world datasets with over a billion edges. Our experiments confirm that InstantEmbedding requires drastically less computation time (over 9,000 times faster) and less memory (by over 8,000 times) to produce a single node\u2019s embedding than traditional methods including DeepWalk, node2vec, VERSE, and FastRP. 
We also show that our method produces high quality representations, demonstrating results that meet or exceed the state of the art for unsupervised representation learning on tasks like node classification and link prediction.", + "title": "InstantEmbedding: Efficient Local Node Representations", + "authors": [ + "Stefan Postavaru", + "Anton Tsitsulin", + "Filipe Miguel Goncalves de Almeida", + "Yingtao Tian", + "Silvio Lattanzi", + "Bryan Perozzi" + ], + "emails": [ + "~Filipe_Miguel_Goncalves_de_Almeida1", + "~Anton_Tsitsulin1", + "~Yingtao_Tian1", + "~Bryan_Perozzi1", + "~Silvio_Lattanzi1", + "~Stefan_Postavaru1" + ], + "rank": 1893 + }, + { + "url": "https://openreview.net/forum?id=wabe-NE8-AX", + "decision": "Reject", + "ratings": [ + 4, + 7, + 4, + 5 + ], + "rating": "4.92", + "confidences": [ + 4, + 3, + 3, + 2 + ], + "abstract": "Fisher Information Matrices (FIM) and Neural Tangent Kernels (NTK) are useful tools in a number of diverse applications related to neural networks. Yet these theoretical tools are often difficult to implement using current libraries for practical size networks, given that they require per-example gradients, and a large amount of memory since they scale as the number of parameters (for the FIM) or the number of examples x cardinality of the output space (for the NTK). NNGeometry is a PyTorch library that offers a simple interface for computing various linear algebra operations such as matrix-vector products, trace, frobenius norm, and so on, where the matrix is either the FIM or the NTK, leveraging recent advances in approximating these matrices. 
We here present the library and motivate our design choices, then we demonstrate it on actual deep neural networks.", + "title": "NNGeometry: Easy and Fast Fisher Information Matrices and Neural Tangent Kernels in PyTorch", + "authors": [ + "Thomas George" + ], + "emails": [ + "~Thomas_George2" + ], + "rank": 1894 + }, + { + "url": "https://openreview.net/forum?id=q_kZm9eHIeD", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5, + 6 + ], + "rating": "4.92", + "confidences": [ + 2, + 4, + 3, + 3 + ], + "abstract": "We study risk-sensitive reinforcement learning with the entropic risk measure and function approximation. We consider the finite-horizon episodic MDP setting, and propose a meta algorithm based on value iteration. We then derive two algorithms for linear and general function approximation, namely RSVI.L and RSVI.G, respectively, as special instances of the meta algorithm. We illustrate that the success of RSVI.L depends crucially on carefully designed feature mapping and regularization that adapt to risk sensitivity. In addition, both RSVI.L and RSVI.G maintain risk-sensitive optimism that facilitates efficient exploration. On the analytic side, we provide regret analysis for the algorithms by developing a meta analytic framework, at the core of which is a risk-sensitive optimism condition. We show that any instance of the meta algorithm that satisfies the condition yields a meta regret bound. We further verify the condition for RSVI.L and RSVI.G under respective function approximation settings to obtain concrete regret bounds that scale sublinearly in the number of episodes. 
\n", + "title": "Entropic Risk-Sensitive Reinforcement Learning: A Meta Regret Framework with Function Approximation", + "authors": [ + "Yingjie Fei", + "Zhuoran Yang", + "Zhaoran Wang" + ], + "emails": [ + "~Zhuoran_Yang1", + "~Yingjie_Fei1", + "~Zhaoran_Wang1" + ], + "rank": 1895 + }, + { + "url": "https://openreview.net/forum?id=26WnoE4hjS", + "decision": "Reject", + "ratings": [ + 5, + 4, + 6, + 5 + ], + "rating": "4.92", + "confidences": [ + 4, + 3, + 2, + 4 + ], + "abstract": "Catastrophic interference is common in many network-based learning systems, and many proposals exist for mitigating it. But, before we overcome interference we must understand it better. In this work, we first provide a definition and novel measure of interference for value-based control methods such as Fitted Q Iteration and DQN. We systematically evaluate our measure of interference, showing that it correlates with forgetting, across a variety of network architectures. Our new interference measure allows us to ask novel scientific questions about commonly used deep learning architectures and develop new learning algorithms. In particular we show that updates on the last layer result in significantly higher interference than updates internal to the network. 
Lastly, we introduce a novel online-aware representation learning algorithm to minimize interference, and we empirically demonstrate that it improves stability and has lower interference.", + "title": "Measuring and mitigating interference in reinforcement learning", + "authors": [ + "Vincent Liu", + "Adam M White", + "Hengshuai Yao", + "Martha White" + ], + "emails": [ + "~Adam_M_White1", + "~Vincent_Liu3", + "~Martha_White1", + "~Hengshuai_Yao2" + ], + "rank": 1896 + }, + { + "url": "https://openreview.net/forum?id=tuIt1aIb6Co", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4 + ], + "rating": "4.92", + "confidences": [ + 3, + 4, + 5 + ], + "abstract": "Unlike traditional supervised learning, in many settings only partial feedback is available. We may only observe outcomes for the chosen actions, but not the counterfactual outcomes associated with other alternatives. Such settings encompass a wide variety of applications including pricing, online marketing and precision medicine. A key challenge is that observational data are influenced by historical policies deployed in the system, yielding a biased data distribution. We approach this task as a domain adaptation problem and propose a self-training algorithm which imputes outcomes for the unseen actions in the observational data to simulate a randomized trial. We offer a theoretical motivation for this approach by providing an upper bound on the generalization error defined on a randomized trial under the self-training objective. 
We empirically demonstrate the effectiveness of the proposed algorithms on both synthetic and real datasets.", + "title": "Counterfactual Self-Training", + "authors": [ + "Ruijiang Gao", + "Max Biggs", + "Wei Sun", + "Ligong Han" + ], + "emails": [ + "scarletmail.rutgers.edu", + "darden.virginia.edu", + "us.ibm.com", + "~Ruijiang_Gao2" + ], + "rank": 1897 + }, + { + "url": "https://openreview.net/forum?id=fGiKxvF-eub", + "decision": "Reject", + "ratings": [ + 7, + 4, + 5, + 4 + ], + "rating": "4.92", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "In this work, we propose a sketching-based central path method for solving linear programmings, whose running time matches the state of art results [Cohen, Lee, Song STOC 19; Lee, Song, Zhang COLT 19]. Our method opens up the iterations of the central path method and deploys an \"iterate and sketch\" approach towards the problem by introducing a new coordinate-wise embedding technique, which may be of independent interest. Compare to previous methods, the work [Cohen, Lee, Song STOC 19] enjoys feasibility while being non-oblivious, and [Lee, Song, Zhang COLT 19] is oblivious but infeasible, and relies on dense sketching matrices such as subsampled randomized Hadamard/Fourier transform matrices. Our method enjoys the benefits of being both oblivious and feasible, and can use sparse sketching matrix [Nelson, Nguyen FOCS 13] to speed up the online matrix-vector multiplication. 
Our framework for solving LP naturally generalizes to a broader class of convex optimization problems including empirical risk minimization.", + "title": "Oblivious Sketching-based Central Path Method for Solving Linear Programming Problems", + "authors": [ + "Zhao Song", + "Zheng Yu" + ], + "emails": [ + "~Zheng_Yu1", + "~Zhao_Song3" + ], + "rank": 1898 + }, + { + "url": "https://openreview.net/forum?id=qk0FE399OJ", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 6 + ], + "rating": "4.92", + "confidences": [ + 4, + 4, + 3, + 2 + ], + "abstract": "Random-sampling Neural Architecture Search (RandomNAS) has recently become a prevailing NAS approach because of its search efficiency and simplicity. There are two main steps in RandomNAS: the training step that randomly samples the weight-sharing architectures from a supernet and iteratively updates their weights, and the search step that ranks architectures by their respective validation performance. Key to both steps is the assumption of a high correlation between estimated performance(i.e., accuracy) for weight-sharing architectures and their respective achievable accuracy (i.e., ground truth) when trained from scratch. We examine such a phenomenon via NASBench-201, whose ground truth is known for its entire NAS search space. We observe that existing RandomNAS can rank a set of architectures uniformly sampled from the entire global search space(GS), that correlates well with its ground-truth ranking. However, if we only focus on the top-performing architectures (such as top 20\\% according to the ground truth) in the GS, such a correlation drops dramatically. This raises the question of whether we can find an effective proxy search space (PS) that is only a small subset of GS to dramatically improve RandomNAS\u2019s search efficiency while at the same time keeping a good correlation for the top-performing architectures. 
This paper proposes a new RandomNAS-based approach called EPS (Evolving the Proxy Search Space) to address this problem. We show that, when applied to NASBench-201, EPS can achieve near-optimal NAS performance and beat all existing state-of-the-art. When applied to different-variants of DARTS-like search spaces for tasks such as image classification and natural language processing, EPS is able to robustly achieve superior performance with shorter or similar search time compared to some leading NAS works. The code is available at https://github.com/IcLr2020SuBmIsSiOn/EPS", + "title": "Improving Random-Sampling Neural Architecture Search by Evolving the Proxy Search Space", + "authors": [ + "Yuhong Li", + "Cong Hao", + "Xiaofan Zhang", + "Jinjun Xiong", + "Wen-mei Hwu", + "Deming Chen" + ], + "emails": [ + "~Wen-mei_Hwu1", + "~Yuhong_Li2", + "~Deming_Chen1", + "illinois.edu", + "~Jinjun_Xiong1" + ], + "rank": 1899 + }, + { + "url": "https://openreview.net/forum?id=nLktL9-M-C6", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4 + ], + "rating": "4.92", + "confidences": [ + 4, + 4, + 5 + ], + "abstract": "Batch Normalization (BN) as an important component assists Deep Neural Networks achieving promising performance for extensive learning tasks by scaling distribution of feature representations within mini-batches. However, the application of BN suffers from performance degradation under the scenario of Unsupervised Domain Adaptation (UDA), since the estimated statistics fail to concurrently describe two different domains. In this paper, we develop a novel normalization technique, named Collaborative Normalization (CoN), for eliminating domain discrepancy and accelerating the model training of neural networks for UDA. Unlike typical strategies only exploiting domain-specific statistics during normalization, our CoN excavates cross-domain knowledge and simultaneously scales features from various domains by mimicking the merits of collaborative representation. 
Our CoN can be easily plugged into popular neural network backbones for cross-domain learning. One the one hand, \\textbf{theoretical analysis} guarantees that models with CoN promote discriminability of feature representations and accelerate convergence rate; on the other hand, \\textbf{empirical study} verifies that replacing BN with CoN in popular network backbones effectively improves classification accuracy with \\textbf{4\\%} in most learning tasks across three cross-domain visual benchmarks.", + "title": "Collaborative Normalization for Unsupervised Domain Adaptation", + "authors": [ + "Haifeng Xia", + "TAOTAO JING", + "Zhengming Ding" + ], + "emails": [ + "~Zhengming_Ding4", + "~Haifeng_Xia2", + "~TAOTAO_JING1" + ], + "rank": 1900 + }, + { + "url": "https://openreview.net/forum?id=-757TnNDwIn", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 4 + ], + "rating": "4.92", + "confidences": [ + 3, + 4, + 2, + 4 + ], + "abstract": "Despite the empirical success of neural architecture search (NAS) in deep learning applications, the optimality, reproducibility and cost of NAS schemes remain hard to assess. The variation in search spaces adopted has further affected a fair comparison between search strategies. In this paper, we focus on search strategies in NAS and propose Generative Adversarial NAS (GA-NAS), promoting stable and reproducible neural architecture search. GA-NAS is theoretically inspired by importance sampling for rare event simulation, and iteratively refits a generator to previously discovered top architectures, thus increasingly focusing on important parts of the search space. 
We propose an efficient adversarial learning approach in GA-NAS, where the generator is not trained based on a large number of observations on architecture performance, but based on the relative prediction made by a discriminator, thus significantly reducing the number of evaluations required.\nExtensive experiments show that GA-NAS beats the best published results under several cases on the public NAS benchmarks including NAS-Bench-101, NAS-Bench-201, and NAS-Bench-301. We further show that GA-NAS can handle ad-hoc search constraints and search spaces. GA-NAS can find new architectures that enhance EfficientNet and ProxylessNAS in terms of ImageNet Top-1 accuracy and/or the number of parameters by searching in their original search spaces.", + "title": "Generative Adversarial Neural Architecture Search with Importance Sampling", + "authors": [ + "SEYED SAEED CHANGIZ REZAEI", + "Fred X. Han", + "Di Niu", + "Mohammad Salameh", + "Keith G Mills", + "Shangling Jui" + ], + "emails": [ + "~Keith_G_Mills1", + "~Fred_X._Han1", + "huawei.com", + "ualberta.ca", + "~SEYED_SAEED_CHANGIZ_REZAEI1" + ], + "rank": 1901 + }, + { + "url": "https://openreview.net/forum?id=a9nIWs-Orh", + "decision": "Reject", + "ratings": [ + 6, + 5, + 4 + ], + "rating": "4.92", + "confidences": [ + 4, + 4, + 5 + ], + "abstract": "Transformer-based pre-trained language models have proven to be effective for learning contextualized language representation. However, current approaches only take advantage of the output of the encoder's final layer when fine-tuning the downstream tasks. We argue that only taking single layer's output restricts the power of pre-trained representation. Thus we deepen the representation learned by the model by fusing the hidden representation in terms of an explicit HIdden Representation Extractor (HIRE), which automatically absorbs the complementary representation with respect to the output from the final layer. 
Utilizing RoBERTa as the backbone encoder, our proposed improvement over the pre-trained models is shown effective on multiple natural language understanding tasks and help our model rival with the state-of-the-art models on the GLUE benchmark.", + "title": "Deepening Hidden Representations from Pre-trained Language Models", + "authors": [ + "Junjie Yang", + "hai zhao" + ], + "emails": [ + "~Junjie_Yang3", + "~hai_zhao1" + ], + "rank": 1902 + }, + { + "url": "https://openreview.net/forum?id=g75kUi1jAc_", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5 + ], + "rating": "4.92", + "confidences": [ + 3, + 4, + 5 + ], + "abstract": "In domains where data are sensitive or private, there is great value in methods that can learn in a distributed manner without the data ever leaving the local devices. In light of this need, federated learning has emerged as a popular training paradigm. However, many federated learning approaches trade transmitting data for communicating updated weight parameters for each local device. Therefore, a successful breach that would have otherwise directly compromised the data instead grants whitebox access to the local model, which opens the door to a number of attacks, including exposing the very data federated learning seeks to protect. Additionally, in distributed scenarios, individual client devices commonly exhibit high statistical heterogeneity. Many common federated approaches learn a single global model; while this may do well on average, performance degrades when the i.i.d. assumption is violated, underfitting individuals further from the mean and raising questions of fairness. To address these issues, we propose Weight Anonymized Factorization for Federated Learning (WAFFLe), an approach that combines the Indian Buffet Process with a shared dictionary of weight factors for neural networks. 
Experiments on MNIST, FashionMNIST, and CIFAR-10 demonstrate WAFFLe's significant improvement to local test performance and fairness while simultaneously providing an extra layer of security.", + "title": "WAFFLe: Weight Anonymized Factorization for Federated Learning", + "authors": [ + "Weituo Hao", + "Nikhil Mehta", + "Kevin J Liang", + "Pengyu Cheng", + "Mostafa El-Khamy", + "Lawrence Carin" + ], + "emails": [ + "~Mostafa_El-Khamy1", + "~Pengyu_Cheng1", + "~Kevin_J_Liang1", + "~Nikhil_Mehta1", + "~Lawrence_Carin2", + "~Weituo_Hao1" + ], + "rank": 1903 + }, + { + "url": "https://openreview.net/forum?id=BEs-Q1ggdwT", + "decision": "Reject", + "ratings": [ + 6, + 5, + 4 + ], + "rating": "4.92", + "confidences": [ + 4, + 4, + 5 + ], + "abstract": "In real-world decision-making problems, risk management is critical. Among various risk management approaches, the mean-variance criterion is one of the most widely used in practice. In this paper, we suggest expected quadratic utility maximization (EQUM) as a new framework for policy gradient style reinforcement learning (RL) algorithms with mean-variance control. The quadratic utility function is a common objective of risk management in finance and economics. The proposed EQUM framework has several interpretations, such as reward-constrained variance minimization and regularization, as well as agent utility maximization. In addition, the computation of the EQUM framework is easier than that of existing mean-variance RL methods, which require double sampling. 
In experiments, we demonstrate the effectiveness of the proposed framework in benchmark setting of RL and financial data.", + "title": "Policy Gradient with Expected Quadratic Utility Maximization: A New Mean-Variance Approach in Reinforcement Learning", + "authors": [ + "Masahiro Kato", + "Kei Nakagawa" + ], + "emails": [ + "gmail.com", + "~Masahiro_Kato1" + ], + "rank": 1904 + }, + { + "url": "https://openreview.net/forum?id=EUUp9nWXsop", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4 + ], + "rating": "4.91", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "Active learning (AL) prioritizes the labeling of the most informative data samples. However, the performance of AL heuristics depends on the structure of the underlying classifier model and the data. We propose an imitation learning scheme that imitates the selection of the best expert heuristic at each stage of the AL cycle in a batch-mode pool-based setting. We use DAGGER to train the policy on a dataset and later apply it to datasets from similar domains. With multiple AL heuristics as experts, the policy is able to reflect the choices of the best AL heuristics given the current state of the AL process. Our experiment on well-known datasets show that we both outperform state of the art imitation learners and heuristics.", + "title": "IALE: Imitating Active Learner Ensembles", + "authors": [ + "Christoffer L\u00f6ffler", + "Christopher Mutschler" + ], + "emails": [ + "~Christopher_Mutschler1", + "~Christoffer_L\u00f6ffler1" + ], + "rank": 1905 + }, + { + "url": "https://openreview.net/forum?id=NqWY3s0SILo", + "decision": "Reject", + "ratings": [ + 4, + 6, + 5 + ], + "rating": "4.91", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "In this paper, we propose Graph Optimized Neural Architecture Learning (GOAL), a novel gradient-based method for Neural Architecture Search (NAS), to find better architectures with fewer evaluated samples. 
Popular NAS methods usually employ black-box optimization based approaches like reinforcement learning, evolution algorithm or Bayesian optimization, which may be inefficient when having huge combinatorial NAS search spaces. In contrast, we aim to explicitly model the NAS search space as graphs, and then perform gradient-based optimization to learn graph structure with efficient exploitation. To this end, we learn a differentiable graph neural network as a surrogate model to rank candidate architectures, which enable us to obtain gradient w.r.t the input architectures. To cope with the difficulty in gradient-based optimization on the discrete graph structures, we propose to leverage proximal gradient descent to find potentially better architectures.\nOur empirical results show that GOAL outperforms mainstream black-box methods on existing NAS benchmarks in terms of search efficiency.", + "title": "Differentiable Graph Optimization for Neural Architecture Search", + "authors": [ + "Chengyue Huang", + "Lingfei Wu", + "Yadong Ding", + "Siliang Tang", + "Fangli Xu", + "Chang Zong", + "Chilie Tan", + "Yueting Zhuang" + ], + "emails": [ + "~Yueting_Zhuang1", + "~Lingfei_Wu1", + "yixue.us", + "zju.edu.cn", + "~Siliang_Tang1", + "~Chengyue_Huang1", + "~Yadong_Ding1", + "tongdun.net" + ], + "rank": 1906 + }, + { + "url": "https://openreview.net/forum?id=Peg7mkjzvyP", + "decision": "Reject", + "ratings": [ + 4, + 5, + 6 + ], + "rating": "4.91", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "Program translation contributes to many real world scenarios, such as porting codebases written in an obsolete or deprecated language to a modern one or re-implementing existing projects in one's preferred programming language. Existing data-driven approaches either require large amounts of training data or neglect significant characteristics of programs. In this paper, we present iPTR for interactive code translation retrieval from Big Code. 
iPTR uses a novel code representation technique that encodes structural characteristics of a program and a predictive transformation technique to transform the representation into the target programming language. The transformed representation is used for code retrieval from Big Code. With our succinct representation, the user can easily update and correct the returned results to improve the retrieval process. Our experiments show that iPTR outperforms supervised baselines in terms of program accuracy.", + "title": "iPTR: Learning a representation for interactive program translation retrieval", + "authors": [ + "Binger Chen", + "Ziawasch Abedjan" + ], + "emails": [ + "dbs.uni-hannover.de", + "~Binger_Chen1" + ], + "rank": 1907 + }, + { + "url": "https://openreview.net/forum?id=jnMjOctlfbZ", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4 + ], + "rating": "4.90", + "confidences": [ + 3, + 3, + 4 + ], + "abstract": "While a variety of methods have been developed for predicting molecular properties, deep learning networks that operate directly on three-dimensional molecular structure have recently demonstrated particular promise. In this work we present ATOM3D, a collection of both novel and existing datasets spanning several key classes of biomolecules, to systematically assess such learning methods. We develop three-dimensional molecular learning networks for each of these tasks, finding that they consistently improve performance relative to one- and two-dimensional methods. The specific choice of architecture proves to be critical for performance, with three-dimensional convolutional networks excelling at tasks involving complex geometries, while graph networks perform well on systems requiring detailed positional information. Furthermore, equivariant networks show significant promise but are currently unable to scale. Our results indicate many molecular problems stand to gain from three-dimensional molecular learning. 
All code and datasets are available at github.com/xxxxxxx/xxxxxx.", + "title": "ATOM3D: Tasks On Molecules in Three Dimensions", + "authors": [ + "Raphael John Lamarre Townshend", + "Martin Vogele", + "Patricia Suriana", + "Alex Derry", + "Alex Powers", + "Yianni Laloudakis", + "Sidhika Balachandar", + "Brandon M Anderson", + "Stephan Eismann", + "Risi Kondor", + "Russ Altman", + "Ron O. Dror" + ], + "emails": [ + "stanford.edu", + "~Brandon_M_Anderson1", + "~Stephan_Eismann1", + "~Risi_Kondor1", + "~Ron_O._Dror1", + "~Russ_Altman1", + "~Raphael_John_Lamarre_Townshend1" + ], + "rank": 1908 + }, + { + "url": "https://openreview.net/forum?id=X6YPReSv5CX", + "decision": "Reject", + "ratings": [ + 5, + 7, + 4, + 4, + 5 + ], + "rating": "4.90", + "confidences": [ + 4, + 3, + 4, + 4, + 5 + ], + "abstract": "The concept of utilizing multi-step returns for updating value functions has been adopted in deep reinforcement learning (DRL) for a number of years. Updating value functions with different backup lengths provides advantages in different aspects, including bias and variance of value estimates, convergence speed, and exploration behavior of the agent. Conventional methods such as TD-lambda leverage these advantages by using a target value equivalent to an exponential average of different step returns. Nevertheless, integrating step returns into a single target sacrifices the diversity of the advantages offered by different step return targets. To address this issue, we propose Mixture Bootstrapped DQN (MB-DQN) built on top of bootstrapped DQN, and uses different backup lengths for different bootstrapped heads. MB-DQN enables heterogeneity of the target values that is unavailable in approaches relying only on a single target value. As a result, it is able to maintain the advantages offered by different backup lengths. In this paper, we first discuss the motivational insights through a simple maze environment. 
In order to validate the effectiveness of MB-DQN, we perform experiments on the Atari 2600 benchmark environments and demonstrate the performance improvement of MB-DQN over a number of baseline methods. We further provide a set of ablation studies to examine the impacts of different design configurations of MB-DQN.", + "title": "Mixture of Step Returns in Bootstrapped DQN", + "authors": [ + "PoHan Chiang", + "Hsuan-Kung Yang", + "Zhang-Wei Hong", + "Chun-Yi Lee" + ], + "emails": [ + "~Hsuan-Kung_Yang1", + "~Chun-Yi_Lee1", + "~Zhang-Wei_Hong1", + "~PoHan_Chiang2" + ], + "rank": 1909 + }, + { + "url": "https://openreview.net/forum?id=OJiM1R3jAtZ", + "decision": "Reject", + "ratings": [ + 4, + 6, + 6, + 3, + 6 + ], + "rating": "4.90", + "confidences": [ + 5, + 3, + 4, + 4, + 4 + ], + "abstract": "Reinforcement learning provides an appealing formalism for learning control policies from experience. However, the classic active formulation of reinforcement learning necessitates a lengthy active exploration process for each behavior, making it difficult to apply in real-world settings. If we can instead allow reinforcement learning to effectively use previously collected data to aid the online learning process, where the data could be expert demonstrations or more generally any prior experience, we could make reinforcement learning a substantially more practical tool. While a number of recent methods have sought to learn offline from previously collected data, it remains exceptionally difficult to train a policy with offline data and improve it further with online reinforcement learning. In this paper we systematically analyze why this problem is so challenging, and propose an algorithm that combines sample-efficient dynamic programming with maximum likelihood policy updates, providing a simple and effective framework that is able to leverage large amounts of offline data and then quickly perform online fine-tuning of reinforcement learning policies. 
We show that our method enables rapid learning of skills with a combination of prior demonstration data and online experience across a suite of difficult dexterous manipulation and benchmark tasks.", + "title": "AWAC: Accelerating Online Reinforcement Learning with Offline Datasets", + "authors": [ + "Ashvin Nair", + "Murtaza Dalal", + "Abhishek Gupta", + "Sergey Levine" + ], + "emails": [ + "~Sergey_Levine1", + "~Murtaza_Dalal1", + "~Abhishek_Gupta1", + "~Ashvin_Nair1" + ], + "rank": 1910 + }, + { + "url": "https://openreview.net/forum?id=pPqvD4E-3br", + "ratings": [ + 5, + 6, + 4 + ], + "rating": "4.90", + "confidences": [ + 3, + 3, + 4 + ], + "abstract": "Neural networks trained on real-world datasets with long-tailed label distributions are biased towards frequent classes and perform poorly on infrequent classes. The imbalance in the ratio of positive and negative samples for each class skews network output probabilities further from ground-truth distributions. We propose a method, Partial Label Masking (PLM), which utilizes this ratio during training. By stochastically masking labels during loss computation, the method balances this ratio for each class, leading to improved recall on minority classes and improved precision on frequent classes. The ratio is estimated adaptively based on the network's performance by minimizing the KL divergence between predicted and ground-truth distributions. Whereas most existing approaches addressing data imbalance are mainly focused on single-label classification and do not generalize well to the multi-label case, this work proposes a general approach to solve the data imbalance issue for multi-label classification. PLM is versatile: it can be applied to most objective functions and it can be used alongside other strategies for class imbalance. 
Our method achieves strong performance when compared to existing methods on both multi-label (MultiMNIST and MSCOCO) and single-label (imbalanced CIFAR-10 and CIFAR-100) image classification datasets.", + "title": "PLM: Partial Label Masking for Imbalanced Multi-label Classification", + "authors": [ + "Kevin Duarte", + "Yogesh S Rawat", + "Mubarak Shah" + ], + "emails": [ + "~Yogesh_S_Rawat1", + "~Mubarak_Shah3", + "~Kevin_Duarte1" + ], + "rank": 1911 + }, + { + "url": "https://openreview.net/forum?id=7UyqgFhPqAd", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5 + ], + "rating": "4.90", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "Neural networks are becoming increasingly popular in applications, but a comprehensive mathematical understanding of their potentials and limitations is still missing. In this paper, we study the prediction accuracies of neural networks from a statistical point of view. In particular, we establish statistical prediction guarantees for deep learning with different types of sparsity-inducing regularization. Our bounds feature a mild dependence on network widths and depths, and, therefore, support the current trend toward wide and deep networks. The tools that we use in our derivations are uncommon in deep learning and, hence, might be of additional interest.", + "title": "Connection- and Node-Sparse Deep Learning: Statistical Guarantees", + "authors": [ + "Johannes Lederer" + ], + "emails": [ + "~Johannes_Lederer1" + ], + "rank": 1912 + }, + { + "url": "https://openreview.net/forum?id=yxafu6ZtUux", + "decision": "Reject", + "ratings": [ + 4, + 3, + 7, + 6 + ], + "rating": "4.90", + "confidences": [ + 1, + 4, + 3, + 2 + ], + "abstract": "Tech companies (e.g., Google or Facebook) often use randomized online experiments and/or A/B testing primarily based on the average treatment effects to compare their new product with an old one. 
However, it is also critically important to detect qualitative treatment effects such that the new one may significantly outperform the existing one only under some specific circumstances. The aim of this paper is to develop a powerful testing procedure to efficiently detect such qualitative treatment effects. We propose a scalable online updating algorithm to implement our test procedure. It has three novelties including adaptive randomization, sequential monitoring, and online updating with guaranteed type-I error control. We also thoroughly examine the theoretical proper- ties of our testing procedure including the limiting distribution of test statistics and the justification of an efficient bootstrap method. Extensive empirical studies are conducted to examine the finite sample performance of our test procedure. ", + "title": "AN ONLINE SEQUENTIAL TEST FOR QUALITATIVE TREATMENT EFFECTS", + "authors": [ + "Chengchun Shi", + "Shikai Luo", + "Rui Song", + "Hongtu Zhu" + ], + "emails": [ + "~Rui_Song2", + "~Chengchun_Shi1", + "~Hongtu_Zhu2", + "didiglobal.com" + ], + "rank": 1913 + }, + { + "url": "https://openreview.net/forum?id=IhUeMfEmexK", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5 + ], + "rating": "4.90", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "Knowledge Distillation (KD) refers to transferring knowledge from a large model to a smaller one, which is widely used to enhance model performance in machine learning. It tries to align embedding spaces generated from the teacher and the student model (i.e. to make images corresponding to the same semantics share the same embedding across different models). In this work, we focus on its application in face recognition. We observe that existing knowledge distillation models optimize the proxy tasks that force the student to mimic the teacher\u2019s behavior, instead of directly optimizing the face recognition accuracy. 
Consequently, the obtained student models are not guaranteed to be optimal on the target task or able to benefit from advanced constraints, such as the large margin constraint (e.g. margin-based softmax). We then propose a novel method named ProxylessKD that directly optimizes face recognition accuracy by inheriting the teacher's classifier as the student's classifier to guide the student to learn discriminative embeddings in the teacher's embedding space. The proposed ProxylessKD is very easy to implement and sufficiently generic to be extended to other tasks beyond face recognition. We conduct extensive experiments on standard face recognition benchmarks, \nand the results demonstrate that ProxylessKD achieves superior performance over existing knowledge distillation methods.", + "title": "ProxylessKD: Direct Knowledge Distillation with Inherited Classifier for Face Recognition", + "authors": [ + "Weidong Shi", + "Guanghui Ren", + "Yunpeng Chen", + "Shuicheng Yan" + ], + "emails": [ + "~Yunpeng_Chen1", + "~Guanghui_Ren1", + "~Shuicheng_Yan2", + "~Weidong_Shi2" + ], + "rank": 1914 + }, + { + "url": "https://openreview.net/forum?id=s9788-pPB2", + "decision": "Reject", + "ratings": [ + 4, + 6, + 5 + ], + "rating": "4.89", + "confidences": [ + 4, + 3, + 2 + ], + "abstract": " While deep networks have produced state-of-the-art results in several domains from image classification to machine translation, hyper-parameter selection remains a significant computational bottleneck. In order to produce the best possible model, practitioners often search across random seeds or use ensemble methods. As models get larger, any method to improve neural network performance that involves re-training becomes intractable. For example, computing the training accuracy of FixResNext-101 (829 million parameters) on ImageNet takes roughly 1~day when using 1~GPU. 
\n In this work, we present LLBoost, a theoretically-grounded, computationally-efficient method to boost the validation accuracy of pre-trained over-parameterized models without impacting the original training accuracy. LLBoost adjusts the last layer of a neural network by adding a term that is orthogonal to the training feature matrix, which is constructed by applying all layers but the last to the training data. We provide an efficient implementation of LLBoost on the GPU and demonstrate that LLBoost, run using only 1 GPU, improves the test/validation accuracy of pre-trained models on CIFAR10, ImageNet32, and ImageNet. In the over-parameterized linear regression setting, we prove that LLBoost reduces the generalization error of any interpolating solution with high probability without affecting training error. ", + "title": "LLBoost: Last Layer Perturbation to Boost Pre-trained Neural Networks", + "authors": [ + "Adityanarayanan Radhakrishnan", + "Neha Prasad", + "Caroline Uhler" + ], + "emails": [ + "~Adityanarayanan_Radhakrishnan1", + "~Caroline_Uhler1", + "mit.edu" + ], + "rank": 1915 + }, + { + "url": "https://openreview.net/forum?id=2NHl-ETnHxk", + "decision": "Reject", + "ratings": [ + 3, + 6, + 3, + 6, + 7 + ], + "rating": "4.89", + "confidences": [ + 4, + 4, + 4, + 4, + 3 + ], + "abstract": "De-identification of magnetic resonance imagery (MRI) is intrinsically difficult since, even with all metadata removed, a person's face can easily be rendered and matched against a database. Existing de-identification methods tackle this task by obfuscating or removing parts of the face, but they either fail to reliably hide the patient's identity or they remove so much information that they adversely affect further analyses in the 3D space surrounding the face. 
In this work, we describe a new class of MRI de-identification techniques that remodel privacy-sensitive facial features as opposed to removing them.To accomplish this, we propose a conditional, multi-scale, 3D GAN architecture that takes a patient's MRI scan as input and generates a 3D volume in which the brain is not modified but the face has been de-identified. Compared to the classical removal-based techniques, our deep learning framework preserves privacy more reliably without adversely affecting downstream medical analyses on the brain, including segmentation and age prediction.", + "title": "Adversarial Privacy Preservation in MRI Scans of the Brain", + "authors": [ + "Lennart Alexander Van der Goten", + "Tobias Hepp", + "Zeynep Akata", + "Kevin Smith" + ], + "emails": [ + "~Lennart_Alexander_Van_der_Goten1", + "~Zeynep_Akata1", + "~Tobias_Hepp1", + "~Kevin_Smith1" + ], + "rank": 1916 + }, + { + "url": "https://openreview.net/forum?id=uFA24r7v4wL", + "decision": "Reject", + "ratings": [ + 6, + 6, + 4, + 4 + ], + "rating": "4.89", + "confidences": [ + 4, + 4, + 5, + 5 + ], + "abstract": "Graph Convolutional Networks (GCNs) have emerged as the state-of-the-art model for graph-based learning tasks. However, it is still challenging to train GCNs at scale, limiting their applications to real-world large graphs and hindering the exploration of deeper and more sophisticated GCN architectures. While it can be natural to leverage graph partition and distributed training for tackling this challenge, this direction has only been slightly touched on previously due to the unique challenge posed by the GCN structures, especially the excessive amount of boundary nodes in each partitioned subgraph, which can easily explode the required memory and communications for distributed training of GCNs. 
To this end, we propose BDS-GCN, a method that adopts unbiased boundary sampling strategy to enable efficient and scalable distributed GCN training while maintaining the full-graph accuracy. Empirical evaluations and ablation studies validate the effectiveness of the proposed BDS-GCN, e.g., boosting the throughput by up-to 500% and reducing the memory usage by up-to 58% for distributed GCN training, while achieving the same accuracy, as compared with the state-of-the-art methods. We believe our BDS-GCN would open up a new paradigm for enabling GCN training at scale. All code will be released publicly upon acceptance.", + "title": "BDS-GCN: Efficient Full-Graph Training of Graph Convolutional Nets with Partition-Parallelism and Boundary Sampling", + "authors": [ + "Cheng Wan", + "Youjie Li", + "Nam Sung Kim", + "Yingyan Lin" + ], + "emails": [ + "~Nam_Sung_Kim3", + "~Cheng_Wan2", + "~Yingyan_Lin1", + "~Youjie_Li1" + ], + "rank": 1917 + }, + { + "url": "https://openreview.net/forum?id=4artD3N3xB0", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4 + ], + "rating": "4.89", + "confidences": [ + 2, + 3, + 4 + ], + "abstract": "Optimizing an objective function with uncertainty awareness is well-known to improve the accuracy and confidence of optimization solutions. Meanwhile, another relevant but very different question remains yet open: how to model and quantify the uncertainty of an optimization algorithm itself? To close such a gap, the prerequisite is to consider the optimizers as sampled from a distribution, rather than a few pre-defined and fixed update rules. We first take the novel angle to consider the algorithmic space of optimizers, each being parameterized by a neural network. We then propose a Boltzmann-shaped posterior over this optimizer space, and approximate the posterior locally as Gaussian distributions through variational inference. 
Our novel model, Bayesian learning to optimize (BL2O) is the first study to recognize and quantify the uncertainty of the optimization algorithm. Our experiments on optimizing test functions, energy functions in protein-protein interactions and loss functions in image classification and data privacy attack demonstrate that, compared to state-of-the-art methods, BL2O improves optimization and uncertainty quantification (UQ) in aforementioned problems as well as calibration and out-of-domain detection in image classification.", + "title": "Bayesian Learning to Optimize: Quantifying the Optimizer Uncertainty", + "authors": [ + "Yue Cao", + "Tianlong Chen", + "Zhangyang Wang", + "Yang Shen" + ], + "emails": [ + "~Zhangyang_Wang1", + "~Yue_Cao4", + "~Yang_Shen4", + "~Tianlong_Chen1" + ], + "rank": 1918 + }, + { + "url": "https://openreview.net/forum?id=5UY7aZ_h37", + "decision": "Reject", + "ratings": [ + 5, + 3, + 7, + 5 + ], + "rating": "4.88", + "confidences": [ + 3, + 5, + 4, + 4 + ], + "abstract": "Having the right inductive biases can be crucial in many tasks or scenarios where data or computing resources are a limiting factor, or where training data is not perfectly representative of the conditions at test time. However, defining, designing, and efficiently adapting inductive biases is not necessarily straightforward. Inductive biases of a model affect its generalisation behaviour and influence the solution it converges to from different aspects. In this paper, we investigate the power of knowledge distillation in transferring the effects of inductive biases of a teacher model to a student model, when they have different architectures.\nWe consider different families of models: LSTMs vs. Transformers and CNNs vs. MLPs, in the context of tasks and scenarios with linguistics and vision applications, where having the right inductive biases is critical. 
We train our models in different setups: no knowledge distillation, self-distillation, and distillation using a teacher with a better inductive bias for the task at hand. We show that in the later setup, compared to no distillation and self-distillation, we can not only improve the performance of the students, but also the solutions they converge become similar to their teachers with respect to a wide range of properties, including different task-specific performance metrics, per sample behaviour of the models, representational similarity and how the representational space of the models evolve during training, performance on out-of-distribution datasets, confidence calibration, and finally whether the converged solutions fall within the same basins of attractions.", + "title": "Transferring Inductive Biases through Knowledge Distillation", + "authors": [ + "Samira Abnar", + "Mostafa Dehghani", + "Willem H. Zuidema" + ], + "emails": [ + "~Samira_Abnar1", + "~Mostafa_Dehghani1", + "~Willem_H._Zuidema1" + ], + "rank": 1919 + }, + { + "url": "https://openreview.net/forum?id=6R51jA4fOB", + "ratings": [ + 4, + 7, + 3, + 5 + ], + "rating": "4.88", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "This paper proposes a simple and effective method, Few-Shot GAN (FSGAN), for adapting GANs in few-shot settings (less than 100 images). FSGAN repurposes component analysis techniques, learning to adapt the singular values of the pre-trained weights while freezing the corresponding singular vectors. This provides a highly expressive parameter space for adaptation while constraining changes to the pretrained weights. We validate our method in a challenging few-shot setting of 5-100 images in the target domain. We show that our method has significant visual quality gains compared with existing GAN adapation methods. We report extensive qualitative and quantitative results showing the effectiveness of our method. 
We additionally highlight a problem for few-shot synthesis in the standard quantitative metric used by data-efficient image synthesis works.", + "title": "Few-shot Adaptation of Generative Adversarial Networks", + "authors": [ + "Esther Robb", + "Wen-Sheng Chu", + "Abhishek Kumar", + "Jia-Bin Huang" + ], + "emails": [ + "~Jia-Bin_Huang1", + "~Esther_Robb1", + "~Wen-Sheng_Chu1", + "~Abhishek_Kumar1" + ], + "rank": 1920 + }, + { + "url": "https://openreview.net/forum?id=jN8TTVCgOqf", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 4 + ], + "rating": "4.88", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "Graph Neural Networks (GNNs), which benefit various real-world problems and applications, have emerged as a powerful technique for learning graph representations. The depth of a GNN model, denoted by K, restricts the receptive field of a node to its K-hop neighbors and plays a subtle role in the performance of GNNs. Recent works demonstrate how different choices of K produce a trade-off between increasing representation capacity and avoiding over-smoothing. We establish a theoretical connection between GNNs and local clustering, showing that short random-walks in GNNs have a high probability to be stuck at a local cluster. Based on the theoretical analysis, we propose Local Clustering Graph Neural Networks (LCGNN), a GNN learning paradigm that utilizes local clustering to efficiently search for small but compact subgraphs for GNN training and inference. Compared to full-batch GNNs, sampling-based GNNs and graph partition-based GNNs, LCGNN performs comparably or even better, achieving state-of-the-art results on four Open Graph Benchmark (OGB) datasets. 
The locality of LCGNN allows it to scale to graphs with 100M nodes and 1B edges on a single GPU.", + "title": "Local Clustering Graph Neural Networks", + "authors": [ + "Jiezhong Qiu", + "Yukuo Cen", + "Qibin Chen", + "Chang Zhou", + "Jingren Zhou", + "Hongxia Yang", + "Jie Tang" + ], + "emails": [ + "~Jingren_Zhou1", + "~Jie_Tang1", + "~Yukuo_Cen1", + "~Hongxia_Yang2", + "~Jiezhong_Qiu1", + "~Qibin_Chen1", + "~Chang_Zhou2" + ], + "rank": 1921 + }, + { + "url": "https://openreview.net/forum?id=B5bZp0m7jZd", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5 + ], + "rating": "4.88", + "confidences": [ + 2, + 3, + 3 + ], + "abstract": "The prioritized Experience Replay (ER) method has attracted great attention; however, there is little theoretical understanding of such prioritization strategy and why they help. In this work, we revisit prioritized ER and, in an ideal setting, show equivalence to minimizing cubic loss, providing theoretical insight into why it improves upon uniform sampling. This theoretical equivalence highlights two limitations of current prioritized experience replay methods: insufficient coverage of the sample space and outdated priorities of training samples. This motivates our model-based approach, which does not suffer from these limitations. Our key idea is to actively search for high priority states using gradient ascent. Under certain conditions, we prove that the hypothetical experiences generated from these states are sampled proportionally to approximately true priorities. We also characterize the distance between the sampling distribution of our method and the true prioritized sampling distribution. Our experiments on both benchmark and application-oriented domains show that our approach achieves superior performance over baselines. 
", + "title": "Beyond Prioritized Replay: Sampling States in Model-Based RL via Simulated Priorities", + "authors": [ + "Jincheng Mei", + "Yangchen Pan", + "Martha White", + "Amir-massoud Farahmand", + "Hengshuai Yao" + ], + "emails": [ + "~Amir-massoud_Farahmand1", + "~Jincheng_Mei1", + "~Martha_White1", + "~Yangchen_Pan2", + "~Hengshuai_Yao2" + ], + "rank": 1922 + }, + { + "url": "https://openreview.net/forum?id=W1uVrPNO8Bw", + "decision": "Reject", + "ratings": [ + 4, + 7, + 3 + ], + "rating": "4.88", + "confidences": [ + 3, + 3, + 2 + ], + "abstract": "A central ingredient in the impressive predictive performance of deep neural networks is optimization via stochastic gradient descent (SGD). While some theoretical progress has been made, the effect of SGD in neural networks is still unclear, especially during the early phase of training. Here we generalize the theory of thermophoresis from statistical mechanics and show that there exists an effective entropic force from SGD that pushes to reduce the gradient variance. We study this effect in detail in a simple two-layer model, where the thermophoretic force functions to decreases the weight norm and activation rate of the units. The strength of this effect is proportional to squared learning rate and inverse batch size, and is more effective during the early phase of training when the model's predictions are poor. Lastly we test our quantitative predictions with experiments on various models and datasets.\n", + "title": "Implicit Regularization of SGD via Thermophoresis", + "authors": [ + "Mingwei Wei", + "David J. Schwab" + ], + "emails": [ + "~David_J._Schwab1", + "~Mingwei_Wei1" + ], + "rank": 1923 + }, + { + "url": "https://openreview.net/forum?id=4Nt1F3qf9Gn", + "decision": "Reject", + "ratings": [ + 5, + 7, + 4, + 4 + ], + "rating": "4.88", + "confidences": [ + 3, + 4, + 5, + 5 + ], + "abstract": "The healthcare industry generates troves of unlabelled physiological data. 
This data can be exploited via contrastive learning, a self-supervised pre-training method that encourages representations of instances to be similar to one another. We propose a family of contrastive learning methods, CLOCS, that encourages representations across space, time, \\textit{and} patients to be similar to one another. We show that CLOCS consistently outperforms the state-of-the-art methods, BYOL and SimCLR, when performing a linear evaluation of, and fine-tuning on, downstream tasks. We also show that CLOCS achieves strong generalization performance with only 25\\% of labelled training data. Furthermore, our training procedure naturally generates patient-specific representations that can be used to quantify patient-similarity. ", + "title": "CLOCS: Contrastive Learning of Cardiac Signals Across Space, Time, and Patients", + "authors": [ + "Dani Kiyasseh", + "Tingting Zhu", + "David A. Clifton" + ], + "emails": [ + "~Dani_Kiyasseh1", + "~David_A._Clifton1", + "eng.ox.ac.uk" + ], + "rank": 1924 + }, + { + "url": "https://openreview.net/forum?id=ohz3OEhVcs", + "decision": "Reject", + "ratings": [ + 3, + 5, + 6, + 6 + ], + "rating": "4.88", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Recent studies have indicated that Graph Convolutional Networks (GCNs) act as a low pass filter in spectral domain and encode smoothed node representations. In this paper, we consider their opposite, namely Graph Deconvolutional Networks (GDNs) that reconstruct graph signals from smoothed node representations. We motivate the design of Graph Deconvolutional Networks via a combination of inverse filters in spectral domain and de-noising layers in wavelet domain, as the inverse operation results in a high pass filter and may amplify the noise. Based on the proposed GDN, we further propose a graph autoencoder framework that first encodes smoothed graph representations with GCN and then decodes accurate graph signals with GDN. 
We demonstrate the effectiveness of the proposed method on several tasks including unsupervised graph-level representation, social recommendation and graph generation.", + "title": "Graph Autoencoders with Deconvolutional Networks", + "authors": [ + "Jia Li", + "Jianwei Yu", + "Da-Cheng Juan", + "HAN Zhichao", + "Arjun Gopalan", + "Hong Cheng", + "Andrew Tomkins" + ], + "emails": [ + "~HAN_Zhichao1", + "~Jia_Li4", + "~Hong_Cheng1", + "google.com", + "~Da-Cheng_Juan1", + "~Andrew_Tomkins2", + "se.cuhk.edu.hk" + ], + "rank": 1925 + }, + { + "url": "https://openreview.net/forum?id=u846Bqhry_", + "decision": "Reject", + "ratings": [ + 3, + 6, + 5, + 6 + ], + "rating": "4.88", + "confidences": [ + 5, + 3, + 4, + 5 + ], + "abstract": "This work explores deep learning based classification model on real-world datasets with a long-tailed distribution. Most of previous works deal with the long-tailed classification problem by re-balancing the overall distribution within the whole dataset or directly transferring knowledge from data-rich classes to data-poor ones. In this work, we consider the gradient distortion in long-tailed classification when the gradient on data-rich classes and data-poor ones are incorporated simultaneously, i.e., shifted gradient direction towards data-rich classes as well as the enlarged variance by the gradient fluctuation on data-poor classes. Motivated by such phenomenon, we propose to disentangle the distinctive effects of data-rich and data-poor gradient and asynchronously train a model via a dual-phase learning process. The first phase only concerns the data-rich classes. In the second phase, besides the standard classification upon data-poor classes, we propose an exemplar memory bank to reserve representative examples and a memory-retentive loss via graph matching to retain the relation between two phases. 
The extensive experimental results on four commonly used long-tailed benchmarks including CIFAR100-LT, Places-LT, ImageNet-LT and iNaturalist 2018 highlight the excellent performance of our proposed method.", + "title": "Asynchronous Modeling: A Dual-phase Perspective for Long-Tailed Recognition", + "authors": [ + "Hu Zhang", + "Linchao Zhu", + "Yi Yang" + ], + "emails": [ + "~Hu_Zhang1", + "~Linchao_Zhu1", + "~Yi_Yang4" + ], + "rank": 1926 + }, + { + "url": "https://openreview.net/forum?id=2nm0fGwWBMr", + "decision": "Reject", + "ratings": [ + 4, + 6, + 5, + 5 + ], + "rating": "4.88", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "Learning unsupervised node embeddings facilitates several downstream tasks such as node classification and link prediction. A node embedding is universal if it is designed to be used by and benefit various downstream tasks. This work introduces PanRep, a graph neural network (GNN) model, for unsupervised learning of universal node representations for heterogenous graphs. PanRep consists of a GNN encoder that obtains node embeddings and four decoders, each capturing different topological and node feature properties. Abiding to these properties the novel unsupervised framework learns universal embeddings applicable to different downstream tasks. PanRep can be furthered fine-tuned to account for possible limited labels. In this operational setting PanRep is considered as a pretrained model for extracting node embeddings of heterogenous graph data. PanRep outperforms all unsupervised and certain supervised methods in node classification and link prediction, especially when the labeled data for the supervised methods is small. PanRep-FT (with fine-tuning) outperforms all other supervised approaches, which corroborates the merits of pretraining models. Finally, we apply PanRep-FT for discovering novel drugs for Covid-19. 
We showcase the advantage of universal embeddings in drug repurposing and identify several drugs used in clinical trials as possible drug candidates.", + "title": "PanRep: Universal node embeddings for heterogeneous graphs", + "authors": [ + "Vassilis N. Ioannidis", + "Da Zheng", + "George Karypis" + ], + "emails": [ + "~George_Karypis1", + "amazon.com", + "~Vassilis_N._Ioannidis1" + ], + "rank": 1927 + }, + { + "url": "https://openreview.net/forum?id=TG93-JNjMr4", + "ratings": [ + 5, + 6, + 5, + 4 + ], + "rating": "4.88", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "Car-focused navigation services are based on turns and distances of named streets, whereas navigation instructions naturally used by humans are centered around physical objects called landmarks. We present a neural model that takes OpenStreetMap representations as input and learns to generate navigation instructions that contain visible and salient landmarks from human natural language instructions. Routes on the map are encoded in a location- and rotation-invariant graph representation that is decoded into natural language instructions. Our work is based on a novel dataset of 7,672 crowd-sourced instances that have been verified by human navigation in Street View. 
Our evaluation shows that the navigation instructions generated by our system have similar properties as human-generated instructions, and lead to successful human navigation in Street View.", + "title": "Generating Landmark Navigation Instructions from Maps as a Graph-to-Text Problem", + "authors": [ + "Raphael Schumann", + "Stefan Riezler" + ], + "emails": [ + "~Raphael_Schumann2", + "~Stefan_Riezler1" + ], + "rank": 1928 + }, + { + "url": "https://openreview.net/forum?id=sebtMY-TrXh", + "decision": "Reject", + "ratings": [ + 6, + 7, + 5, + 4, + 3 + ], + "rating": "4.88", + "confidences": [ + 2, + 4, + 3, + 4, + 4 + ], + "abstract": "Mapping sequences of discrete data to a point in a continuous space makes it difficult to retrieve those sequences via random sampling. Mapping the input to a volume would make it easier to retrieve at test time, and that is the strategy followed by the family of approaches based on Variational Autoencoder. However the fact that they are at the same time optimizing for prediction and for smoothness of representation, forces them to trade-off between the two. We benchmark the performance of some of the standard methods in deep learning to generate sentences by uniformly sampling a continuous space. We do it by proposing AriEL, that constructs volumes in a continuous space, without the need of encouraging the creation of volumes through the loss function. We first benchmark on a toy grammar, that allows to automatically evaluate the language learned and generated by the models. Then, we benchmark on a real dataset of human dialogues. Our results indicate that the random access to the stored information can be significantly improved, since our method AriEL is able to generate a wider variety of correct language by randomly sampling the latent space. VAE follows in performance for the toy dataset while, AE and Transformer follow for the real dataset. 
This partially supports the hypothesis that encoding information into volumes instead of into points, leads to improved retrieval of learned information with random sampling. We hope this analysis can clarify directions to lead to better generators.", + "title": "AriEL: Volume Coding for Sentence Generation Comparisons", + "authors": [ + "Luca Celotti", + "Simon Brodeur", + "Jean Rouat" + ], + "emails": [ + "~Simon_Brodeur1", + "~Luca_Celotti1", + "~Jean_Rouat1" + ], + "rank": 1929 + }, + { + "url": "https://openreview.net/forum?id=48goXfYCVFX", + "decision": "Reject", + "ratings": [ + 5, + 7, + 5, + 3 + ], + "rating": "4.88", + "confidences": [ + 4, + 3, + 5, + 4 + ], + "abstract": "Supporting chefs with ingredient recommender systems to create new recipes is challenging, as good ingredient combinations depend on many factors like taste, smell, cuisine style, texture among others. There have been few attempts to address these issues using machine learning. Importantly, useful models do obviously need to be accurate but importantly -- especially for food professionals -- interpretable. In order to address these issues, we propose the Interpretable Relational Representation Model (IRRM). The main component of the model is a key-value memory network to represent relationships of ingredients. We propose and test two variants of the model.\nOne can learn latent relational representations over a trainable memory network (Implicit model), and the other can learn explainable relational representations over a pre-trained memory network that integrates an external knowledge base (Explicit model).\nThe relational representations resulting from the model are interpretable -- they allow to inspect why certain ingredient pairings have been suggested. 
The Explicit model additionally allows to integrate any number of manually specified constraints.\nWe conduct experiments on two recipe datasets, including CulinaryDB with 45,772 recipes and Flavornet with 55,001 recipes, respectively. The experimental results show that our models are both predictive and informative.", + "title": "Interpretable Relational Representations for Food Ingredient Recommendation Systems", + "authors": [ + "Kana Maruyama", + "Michael Spranger" + ], + "emails": [ + "~Michael_Spranger2", + "~Kana_Maruyama1" + ], + "rank": 1930 + }, + { + "url": "https://openreview.net/forum?id=defQ1AG6IWn", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 4 + ], + "rating": "4.88", + "confidences": [ + 3, + 5, + 3, + 5 + ], + "abstract": "Unsupervised domain adaptation (UDA) is to make predictions for unlabeled data in a target domain with labeled data from source domain available. Recent advances exploit entropy minimization and self-training to align the feature of two domains. However, as decision boundary is largely biased towards source data, class-wise pseudo labels generated by target predictions are usually very noisy, and trusting those noisy supervisions might potentially deteriorate the intrinsic target discriminative feature. Motivated by agglomerative clustering which assumes that features in the near neighborhood should be clustered together, we observe that target features from source pre-trained model are highly intrinsic discriminative and have a high probability of sharing the same label with their neighbors. Based on those observations, we propose a simple but effective method to impose Neighbor Class Consistency on target features to preserve and further strengthen the intrinsic discriminative nature of target data while regularizing the unified classifier less biased towards source data. We also introduce an entropy-based weighting scheme to help our framework more robust to the potential noisy neighbor supervision. 
We conduct ablation studies and extensive experiments on three UDA image classification benchmarks. Our method outperforms all existing UDA state-of-the-art. ", + "title": "Neighbor Class Consistency on Unsupervised Domain Adaptation", + "authors": [ + "Chang Liu", + "Kai Li", + "Yun Fu" + ], + "emails": [ + "northeastern.edu", + "~Kai_Li3", + "~Yun_Fu1" + ], + "rank": 1931 + }, + { + "url": "https://openreview.net/forum?id=Q9U_H8lQ4yV", + "ratings": [ + 4, + 5, + 5, + 6 + ], + "rating": "4.88", + "confidences": [ + 5, + 4, + 5, + 3 + ], + "abstract": "A neural multimodal machine translation (MMT) system is one that aims to perform better translation by extending conventional text-only translation models with multimodal information. Many recent studies report improvements when equipping their models with the multimodal module, despite the controversy whether such improvements indeed come from the multimodal part. We revisit the recent development of neural multimodal machine translation by proposing two \\textit{interpretable} MMT models that achieve new state-of-the-art results on the standard \\dataset\\ dataset. To our surprise, however, while we observe similar gains as in the recent developed multimodal-integrated models, our models learn to \\textit{ignore} the multimodal information. Upon further investigation, we discover that the improvements bought about by the multimodal models over text-only counterpart are in fact results of the regularization effect. 
We report our empirical findings which express the importance of MMT models' interpretability and set new paradigms for future MMT research.", + "title": "Good for Misconceived Reasons: Revisiting Neural Multimodal Machine Translation", + "authors": [ + "Zhiyong Wu", + "Lingpeng Kong", + "Ben Kao" + ], + "emails": [ + "~Ben_Kao1", + "~Lingpeng_Kong1", + "~Zhiyong_Wu3" + ], + "rank": 1932 + }, + { + "url": "https://openreview.net/forum?id=ox8wgFpoyHc", + "decision": "Reject", + "ratings": [ + 5, + 6, + 3, + 6 + ], + "rating": "4.88", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Undertaking causal inference with observational data is extremely useful across a wide range of domains including the development of medical treatments, advertisements and marketing, and policy making. There are two main challenges associated with undertaking causal inference using observational data: treatment assignment heterogeneity (i.e., differences between the treated and untreated groups), and an absence of counterfactual data (i.e., not knowing what would have happened if an individual who did get treatment, were instead to have not been treated). We address these two challenges by combining structured inference and targeted learning. To our knowledge, Targeted Variational AutoEncoder (TVAE) is the first method to incorporate targeted learning into deep latent variable models. 
Results demonstrate competitive and state of the art performance.\n", + "title": "Targeted VAE: Structured Inference and Targeted Learning for Causal Parameter Estimation", + "authors": [ + "Matthew James Vowels", + "Necati Cihan Camgoz", + "Richard Bowden" + ], + "emails": [ + "~Matthew_James_Vowels1", + "~Richard_Bowden1", + "~Necati_Cihan_Camgoz1" + ], + "rank": 1933 + }, + { + "url": "https://openreview.net/forum?id=pT9LLoqwiT9", + "ratings": [ + 5, + 6, + 5, + 5, + 4 + ], + "rating": "4.88", + "confidences": [ + 4, + 2, + 4, + 3, + 4 + ], + "abstract": "Amodal segmentation in biological vision refers to the perception of the entire object when only a fraction is visible. This ability of seeing through occluders and reasoning about occlusion is innate to biological vision but not adequately modeled in current machine vision approaches. A key challenge is that ground-truth supervisions of amodal object segmentation are inherently difficult to obtain. In this paper, we present a neural network architecture that is capable of amodal perception, when weakly supervised with standard (modal) bounding box annotations. Our model extends compositional convolutional neural networks (CompositionalNets), which have been shown to be robust to partial occlusion by explicitly representing objects as composition of parts. In particular, we extend CompositionalNets by: 1) Expanding the innate part-voting mechanism in the CompositionalNets to perform instance segmentation; 2) and by exploiting the internal representations of CompositionalNets to enable amodal completion for both bounding box and segmentation mask. 
Our extensive experiments show that our proposed model can segment amodal masks robustly, with much improved mask prediction qualities compared to state-of-the-art amodal segmentation approaches.", + "title": "Weakly-Supervised Amodal Instance Segmentation with Compositional Priors", + "authors": [ + "Yihong Sun", + "Adam Kortylewski", + "Alan Yuille" + ], + "emails": [ + "~Alan_Yuille1", + "~Adam_Kortylewski1", + "~Yihong_Sun1" + ], + "rank": 1934 + }, + { + "url": "https://openreview.net/forum?id=nlWgE3A-iS", + "decision": "Reject", + "ratings": [ + 4, + 5, + 6, + 4 + ], + "rating": "4.87", + "confidences": [ + 3, + 5, + 4, + 3 + ], + "abstract": "Deep Reinforcement Learning (DRL) can distill behavioural policies from sensory input that solve complex tasks, however, the policies tend to be task-specific and sample inefficient, requiring a large number of interactions with the environment that may be costly or impractical for many real world applications. Model-based DRL (MBRL) can allow learned behaviours and dynamics from one task to be translated to a new task in a related environment, but still suffer from low sample efficiency. In this work we introduce ReaPER, an algorithm that addresses the sample efficiency challenge in model-based DRL, we illustrate the power of the proposed solution on the DeepMind Control benchmark. Our improvements are driven by sparse , self-supervised, contrastive model representations and efficient use of past experience. We empirically analyze each novel component of ReaPER and analyze how they contribute to sample efficiency. We also illustrate how other standard alternatives fail to improve upon previous methods. 
Code will be made available.", + "title": "ReaPER: Improving Sample Efficiency in Model-Based Latent Imagination", + "authors": [ + "Martin A Bertran", + "Guillermo Sapiro", + "mariano phielipp" + ], + "emails": [ + "~mariano_phielipp1", + "~Guillermo_Sapiro1", + "~Martin_A_Bertran1" + ], + "rank": 1935 + }, + { + "url": "https://openreview.net/forum?id=K398CuAKVKB", + "decision": "Reject", + "ratings": [ + 6, + 4, + 4, + 5 + ], + "rating": "4.87", + "confidences": [ + 4, + 2, + 4, + 5 + ], + "abstract": "It has been shown that the core reasons that complex and hypercomplex valued neural networks offer improvements over their real-valued counterparts is the fact that aspects of their algebra forces treating multi-dimensional data as a single entity (forced local relationship encoding) with an added benefit of reducing parameter count via weight sharing. However, both are constrained to a set number of dimensions, two for complex and four for quaternions. These observations motivate us to introduce novel vector map convolutions which capture both of these properties provided by complex/hypercomplex convolutions, while dropping the unnatural dimensionality constraints their algebra imposes. This is achieved by introducing a system that mimics the unique linear combination of input dimensions via the Hamilton product using a permutation function, as well as batch normalization and weight initialization for the system. We perform three experiments using three different network architectures to show that these novel vector map convolutions seem to capture all the benefits of complex and hyper-complex networks, such as their ability to capture internal latent relations, while avoiding the dimensionality restriction.", + "title": "Removing Dimensional Restrictions on Complex/Hyper-complex Convolutions", + "authors": [ + "Chase John Gaudet", + "Anthony S. 
Maida" + ], + "emails": [ + "~Anthony_S._Maida1", + "~Chase_John_Gaudet1" + ], + "rank": 1936 + }, + { + "url": "https://openreview.net/forum?id=CHTHamtufWN", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 3 + ], + "rating": "4.87", + "confidences": [ + 2, + 4, + 5, + 4 + ], + "abstract": "Empirical risk minimization can lead to poor generalization behaviour on unseen environments if the learned model does not capture invariant feature represen- tations. Invariant risk minimization (IRM) is a recent proposal for discovering environment-invariant representations. It was introduced by Arjovsky et al. (2019) and extended by Ahuja et al. (2020). The assumption of IRM is that all environ- ments are available to the learning system at the same time. With this work, we generalize the concept of IRM to scenarios where environments are observed se- quentially. We show that existing approaches, including those designed for contin- ual learning, fail to identify the invariant features and models across sequentially presented environments. We extend IRM under a variational Bayesian and bilevel framework, creating a general approach to continual invariant risk minimization. We also describe a strategy to solve the optimization problems using a variant of the alternating direction method of multiplier (ADMM). 
We show empirically us- ing multiple datasets and with multiple sequential environments that the proposed methods outperforms or is competitive with prior approaches.", + "title": "Continual Invariant Risk Minimization", + "authors": [ + "Francesco Alesiani", + "Shujian Yu", + "Mathias Niepert" + ], + "emails": [ + "~Francesco_Alesiani1", + "~Mathias_Niepert1", + "~Shujian_Yu1" + ], + "rank": 1937 + }, + { + "url": "https://openreview.net/forum?id=tLRxBLoTAM", + "ratings": [ + 5, + 3, + 9, + 3 + ], + "rating": "4.87", + "confidences": [ + 2, + 4, + 4, + 5 + ], + "abstract": "We present a method for making predictions using neural networks that, at the test time, is robust against shifts from the training data distribution. The proposed method is based on making \\emph{one prediction via different cues} (called middle domains) and ensembling their outputs into one strong prediction. The premise of the idea is that predictions via different cues respond differently to distribution shifts, hence one can merge them into one robust final prediction, if ensembling can be done successfully. We perform the ensembling in a straightforward but principled probabilistic manner. The evaluations are performed using multiple vision dataset under a range of natural and synthetic distribution shifts which demonstrate the proposed method is considerably more robust compared to its standard learning counterpart, conventional ensembles, and several other baselines.", + "title": "Robustness via Probabilistic Cross-Task Ensembles", + "authors": [ + "Teresa Yeo", + "Oguzhan Fatih Kar", + "Amir Zamir" + ], + "emails": [ + "~Oguzhan_Fatih_Kar1", + "~Teresa_Yeo1", + "~Amir_Zamir1" + ], + "rank": 1938 + }, + { + "url": "https://openreview.net/forum?id=LQTsVy-Xli", + "ratings": [ + 4, + 4, + 7, + 5 + ], + "rating": "4.87", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Few-shot learning aims to recognize new classes with few annotated instances within each category. 
Recently, metric-based meta-learning approaches have shown the superior performance in tackling few-shot learning problems. Despite their success, existing metric-based few-shot approaches often fail to push the fine-grained sub-categories apart in the embedding space given no fine-grained labels. This may result in poor generalization to fine-grained sub-categories, and thus affects model interpretation. To alleviate this problem, we introduce contrastive loss into few-shot classification for learning latent fine-grained structure in the embedding space. Furthermore, to overcome the drawbacks of random image transformation used in current contrastive learning in producing noisy and inaccurate image pairs (i.e., views), we develop a learning-to-learn algorithm to automatically generate different views of the same image. Extensive experiments on standard few-shot learning benchmarks and few-shot fine-grained image classification demonstrate the superiority of our method. ", + "title": "Auto-view contrastive learning for few-shot image recognition", + "authors": [ + "Xu Luo", + "Yuxuan Chen", + "Liangjian Wen", + "Lili Pan", + "Zenglin Xu" + ], + "emails": [ + "~Lili_Pan2", + "~Zenglin_Xu1", + "~Liangjian_Wen1", + "~Xu_Luo1", + "~Yuxuan_Chen2" + ], + "rank": 1939 + }, + { + "url": "https://openreview.net/forum?id=pWipslK5xVf", + "decision": "Reject", + "ratings": [ + 4, + 4, + 6, + 6, + 5 + ], + "rating": "4.87", + "confidences": [ + 3, + 4, + 3, + 2, + 3 + ], + "abstract": "We propose a novel Wasserstein distributional normalization (WDN) algorithm to handle noisy labels for accurate classification. In this paper, we split our data into uncertain and certain samples based on small loss criteria. We investigate the geometric relationship between these two different types of samples and enhance this relation to exploit useful information, even from uncertain samples. 
To this end, we impose geometric constraints on the uncertain samples by normalizing them into the Wasserstein ball centered on certain samples. Experimental results demonstrate that our WDN outperforms other state-of-the-art methods on the Clothing1M and CIFAR-10/100 datasets, which have diverse noisy labels. The proposed WDN is highly compatible with existing classification methods, meaning it can be easily plugged into various methods to improve their accuracy significantly. ", + "title": "Wasserstein Distributional Normalization : Nonparametric Stochastic Modeling for Handling Noisy Labels", + "authors": [ + "Sung Woo Park", + "Junseok Kwon" + ], + "emails": [ + "~Sung_Woo_Park2", + "~Junseok_Kwon5" + ], + "rank": 1940 + }, + { + "url": "https://openreview.net/forum?id=_sSHg203jSu", + "decision": "Reject", + "ratings": [ + 3, + 5, + 5, + 6 + ], + "rating": "4.87", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "The representations learned by large-scale NLP models such as BERT have been widely used in various tasks. However, the increasing model size of the pre-trained models also brings the efficiency challenges, including the inference speed and the model size when deploying the model on devices. Specifically, most operations in BERT consist of matrix multiplications. These matrices are not low-rank and thus canonical matrix decomposition could not find an efficient approximation. In this paper, we observe that the learned representation of each layer lies in a low-dimensional space. Based on this observation, we propose DRONE (data-aware low-rank compression), a provably optimal low-rank decomposition of weight matrices, which has a simple closed form solution that can be efficiently computed. DRONE is generic, could be applied to both fully-connected and self-attention layers, and does not require any fine-tuning or distillation steps. 
Experimental results show that DRONE could improve both model size and inference speed with limited loss of accuracy. Specifically, DRONE alone achieves 1.92x faster on MRPC task with only 1.5%loss of accuracy, and when combined with distillation, DRONE achieves over 12.3x faster on various natural language inference tasks.", + "title": "Data-aware Low-Rank Compression for Large NLP Models", + "authors": [ + "Patrick CHen", + "Hsiang-Fu Yu", + "Inderjit S Dhillon", + "Cho-Jui Hsieh" + ], + "emails": [ + "~Patrick_CHen1", + "~Cho-Jui_Hsieh1", + "gmail.com", + "~Inderjit_S_Dhillon1" + ], + "rank": 1941 + }, + { + "url": "https://openreview.net/forum?id=cgRzg1V9su", + "decision": "Reject", + "ratings": [ + 3, + 7, + 5, + 4 + ], + "rating": "4.87", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "We introduce Inner Ensemble Networks (IENs) which reduce the variance within the neural network itself without an increase in the model complexity. IENs utilize ensemble parameters during the training phase to reduce the network variance. While in the testing phase, these parameters are removed without a change in the enhanced performance. IENs reduce the variance of an ordinary deep model by a factor of 1/mL\u22121, where m is the number of inner ensembles and L is the depth of the model. Also, we show empirically and theoretically that IENs lead to a greater variance reduction in comparison with other similar approaches such as dropout and maxout. Our results show a decrease of error rates between 1.7\\% and 17.3\\% in comparison with an ordinary deep model. 
We also show that IEN was preferred by Neural Architecture Search (NAS) methods over prior approaches.", + "title": "Inner Ensemble Networks: Average Ensemble as an Effective Regularizer", + "authors": [ + "Abduallah Mohamed", + "Muhammed Mohaimin Sadiq", + "Ehab AlBadawy", + "Mohamed Elhoseiny", + "Christian Claudel" + ], + "emails": [ + "~Abduallah_Mohamed1", + "~Christian_Claudel1", + "albany.edu", + "utexas.edu", + "~Mohamed_Elhoseiny1" + ], + "rank": 1942 + }, + { + "url": "https://openreview.net/forum?id=GA87kjyd-f", + "decision": "Reject", + "ratings": [ + 6, + 6, + 4, + 4 + ], + "rating": "4.86", + "confidences": [ + 3, + 3, + 5, + 3 + ], + "abstract": "A number of recent approaches have been proposed for pruning neural network parameters at initialization with the goal of reducing the size and computational burden of models while minimally affecting their training dynamics and generalization performance. While each of these approaches have some amount of well-founded motivation, a rigorous analysis of the effect of these pruning methods on network training dynamics and their formal relationship to each other has thus far received little attention. Leveraging recent theoretical approximations provided by the Neural Tangent Kernel, we unify a number of popular approaches for pruning at initialization under a single path-centric framework. We introduce the Path Kernel as the data-independent factor in a decomposition of the Neural Tangent Kernel and show the global structure of the Path Kernel can be computed efficiently. This Path Kernel decomposition separates the architectural effects from the data-dependent effects within the Neural Tangent Kernel, providing a means to predict the convergence dynamics of a network from its architecture alone. We analyze the use of this structure in approximating training and generalization performance of networks in the absence of data across a number of initialization pruning approaches. 
Observing the relationship between input data and paths and the relationship between the Path Kernel and its natural norm, we additionally propose two augmentations of the SynFlow algorithm for pruning at initialization.", + "title": "A Unified Paths Perspective for Pruning at Initialization", + "authors": [ + "Thomas Gebhart", + "Udit Saxena", + "Paul R. Schrater" + ], + "emails": [ + "gmail.com", + "~Paul_R._Schrater1", + "~Thomas_Gebhart1" + ], + "rank": 1943 + }, + { + "url": "https://openreview.net/forum?id=xvWZQtxI7qq", + "ratings": [ + 6, + 4, + 6, + 4 + ], + "rating": "4.86", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Machine learning models often suffer from the problem of over-fitting. Many data augmentation methods have been proposed to tackle such a problem, and one of them is called mixup. Mixup is a recently proposed regularization procedure, which linearly interpolates a random pair of training examples. This regularization method works very well experimentally, but its theoretical guarantee is not adequately discussed. In this study, we aim to discover why mixup works well from the aspect of the statistical learning theory. In addition, we reveal how the effect of mixup changes in each situation. Furthermore, we also investigated the effects of changes in the parameter of mixup. Our work contributes to searching for the optimal parameters and estimating the effects of the parameters currently used. 
The results of this study provide a theoretical clarification of when and how effective regularization by mixup is.", + "title": "Mixup Training as the Complexity Reduction", + "authors": [ + "Masanari Kimura" + ], + "emails": [ + "~Masanari_Kimura2" + ], + "rank": 1944 + }, + { + "url": "https://openreview.net/forum?id=UFJOP5w0kV", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 4 + ], + "rating": "4.86", + "confidences": [ + 3, + 3, + 3, + 5 + ], + "abstract": "Graph convolutional networks have achieved great success on graph-structured data. Many graph convolutional networks can be regarded as low-pass filters for graph signals. In this paper, we propose a new model, BiGCN, which represents a graph neural network as a bi-directional low-pass filter. Specifically, we not only consider the original graph structure information but also the latent correlation between features, thus BiGCN can filter the signals along with both the original graph and a latent feature-connection graph. Our model outperforms previous graph neural networks in the tasks of node classification and link prediction on benchmark datasets, especially when we add noise to the node features.", + "title": "BiGCN: A Bi-directional Low-Pass Filtering Graph Neural Network", + "authors": [ + "Zhixian Chen", + "Tengfei Ma", + "Zhihua Jin", + "Yangqiu Song", + "Yang Wang" + ], + "emails": [ + "ust.hk", + "~Tengfei_Ma1", + "~Yangqiu_Song1", + "~Zhihua_Jin1", + "~Zhixian_Chen1" + ], + "rank": 1945 + }, + { + "url": "https://openreview.net/forum?id=sWMF2rf9nQt", + "ratings": [ + 3, + 5, + 7, + 5 + ], + "rating": "4.86", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Despite the impressive performance in various pattern recognition tasks, deep neural networks (DNNs) are typically overconfident in their predictions, making it difficult to determine whether a test example is misclassified. 
In this paper, we propose a simple yet effective method of class augmentation (classAug) to address the challenge of misclassification detection in DNNs. Specifically, we increase the number of classes during training by assigning new classes to the samples generated using between-class interpolation. In spite of the simplicity, extensive experiments demonstrate that the misclassification detection performance of DNNs can be significantly improved by seeing more generated pseudo-classes during training. Additionally, we observe that DNNs trained with classAug are more robust on out-of-distribution examples and better calibrated. Finally, as a general regularization strategy, classAug can also enhance the original classification accuracy and few-shot learning performance.", + "title": "Misclassification Detection via Class Augmentation", + "authors": [ + "Fei Zhu", + "Xu-yao Zhang", + "Chuang Wang", + "Cheng-lin Liu" + ], + "emails": [ + "~Xu-yao_Zhang1", + "~Chuang_Wang2", + "~Fei_Zhu1", + "~Cheng-lin_Liu1" + ], + "rank": 1946 + }, + { + "url": "https://openreview.net/forum?id=W0MKrbVOxtd", + "decision": "Reject", + "ratings": [ + 4, + 8, + 4, + 4 + ], + "rating": "4.86", + "confidences": [ + 5, + 3, + 3, + 3 + ], + "abstract": "Spatiotemporal forecasting plays an essential role in intelligent transportation systems (ITS) and numerous applications, such as route planning, navigation, and automatic driving. Deep Spatiotemporal Graph Neural Networks, which capture both spatial and temporal patterns, have achieved great success in traffic forecasting applications. Though Deep Neural Networks (DNNs) have been proven to be vulnerable to carefully designed perturbations in multiple domains like objection classification and graph classification, these adversarial works cannot be directly applied to spatiotemporal GNNs because of their causality and spatiotemporal mechanism. There is still a lack of studies on the vulnerability and robustness of spatiotemporal GNNs. 
Particularly, if spatiotemporal GNNs are vulnerable in real-world traffic applications, a hacker can easily cause serious traffic congestion and even a city-scale breakdown. To fill this gap, we design One Vertex Attack to break deep spatiotemporal GNNs by attacking a single one vertex. To achieve this, we apply the genetic algorithm with a universal attack method as the evaluation function to locate the weakest vertex; then perturbations are generated by solving an optimization problem with the inverse estimation. Empirical studies prove that perturbations in one vertex can be diffused into most of the graph when spatiotemporal GNNs are under One Vertex Attack.", + "title": "One Vertex Attack on Graph Neural Networks-based Spatiotemporal Forecasting", + "authors": [ + "Fuqiang Liu", + "Luis Miranda Moreno", + "Lijun Sun" + ], + "emails": [ + "mcgill.ca", + "~Fuqiang_Liu2" + ], + "rank": 1947 + }, + { + "url": "https://openreview.net/forum?id=WvXOdsBBvt", + "ratings": [ + 6, + 3, + 6, + 5 + ], + "rating": "4.86", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "We study a worst-case scenario in generalization: Out-of-domain generalization from a single source. The goal is to learn a robust model from a single source and expect it to generalize over many unknown distributions. This challenging problem has been seldom investigated while existing solutions suffer from various limitations. In this paper, we propose a new solution. The key idea is to augment the source capacity in both input and label spaces, while the augmentation is guided by uncertainty assessment. To the best of our knowledge, this is the first work to (1) access the generalization uncertainty from a single source and (2) leverage it to guide both input and label augmentation for robust generalization. The model training and deployment are effectively organized in a Bayesian meta-learning framework. We conduct extensive comparisons and ablation study to validate our approach. 
The results prove our superior performance in a wide scope of tasks including image classification, text classification, and speech recognition.", + "title": "Uncertain Out-of-Domain Generalization", + "authors": [ + "Fengchun Qiao", + "Xi Peng" + ], + "emails": [ + "~Fengchun_Qiao1", + "~Xi_Peng1" + ], + "rank": 1948 + }, + { + "url": "https://openreview.net/forum?id=UFWnZn2v0bV", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 4 + ], + "rating": "4.86", + "confidences": [ + 3, + 3, + 3, + 5 + ], + "abstract": "Sparsity has become popular in machine learning, because it can save computational resources, facilitate interpretations, and prevent overfitting. In this paper, we discuss sparsity in the framework of neural networks. In particular, we formulate a new notion of sparsity that concerns the networks\u2019 layers and, therefore, aligns particularly well with the current trend toward deep networks. We call this notion layer sparsity. We then introduce corresponding regularization and refitting schemes that can complement standard deep-learning pipelines to generate more compact and accurate networks.", + "title": "LAYER SPARSITY IN NEURAL NETWORKS", + "authors": [ + "Mohamed Hebiri", + "Johannes Lederer" + ], + "emails": [ + "~Johannes_Lederer1", + "univ-eiffel.fr" + ], + "rank": 1949 + }, + { + "url": "https://openreview.net/forum?id=99M-4QlinPr", + "decision": "Reject", + "ratings": [ + 5, + 3, + 5, + 7 + ], + "rating": "4.86", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Reinforcement learning from self-play has recently reported many successes. Self-play, where the agents compete with themselves, is often used to generate training data for iterative policy improvement. In previous work, heuristic rules are designed to choose an opponent for the current learner. Typical rules include choosing the latest agent, the best agent, or a random historical agent. 
However, these rules may be inefficient in practice and sometimes do not guarantee convergence even in the simplest matrix games. This paper proposes a new algorithmic framework for competitive self-play reinforcement learning in two-player zero-sum games. We recognize the fact that the Nash equilibrium coincides with the saddle point of the stochastic payoff function, which motivates us to borrow ideas from classical saddle point optimization literature. Our method simultaneously trains several agents and intelligently takes each other as opponents based on a simple adversarial rule derived from a principled perturbation-based saddle optimization method. We prove theoretically that our algorithm converges to an approximate equilibrium with high probability in convex-concave games under standard assumptions. Beyond the theory, we further show the empirical superiority of our method over baseline methods relying on the aforementioned opponent-selection heuristics in matrix games, grid-world soccer, Gomoku, and simulated robot sumo, with neural net policy function approximators.", + "title": "Efficient Competitive Self-Play Policy Optimization", + "authors": [ + "Yuanyi Zhong", + "Yuan Zhou", + "Jian Peng" + ], + "emails": [ + "~Yuan_Zhou1", + "~Jian_Peng1", + "~Yuanyi_Zhong1" + ], + "rank": 1950 + }, + { + "url": "https://openreview.net/forum?id=o5KkQBuMWCm", + "ratings": [ + 6, + 6, + 5, + 3 + ], + "rating": "4.85", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Bayesian optimization, whose efficiency for automatic hyperparameter tuning has been verified over the decade, still faces a standing dilemma between massive consumption of time and suboptimal search results. Although much effort has been devoted to accelerate and improve the optimizer, the dominantly time-consuming step of evaluation receives relatively less attention. 
In this paper, we propose a novel online Bayesian algorithm, which optimizes hyperparameters and learns the training dynamics to make it free from the repeated complete evaluations. To solve the non-stationary problem i.e. the same hyperparameters will lead to varying results at different training steps, we combine the training loss and the dominant eigenvalue to track training dynamics. Compared to traditional algorithms, it saves time and utilizes the important intermediate information which are not well leveraged by classical Bayesian methods that only focus on the final results. The performance on CIFAR-10 and CIFAR-100 verifies the efficacy of our approach.", + "title": "GSdyn: Learning training dynamics via online Gaussian optimization with gradient states", + "authors": [ + "Haoran Liao", + "Junchi Yan", + "Zimin Feng" + ], + "emails": [ + "~Junchi_Yan2", + "cmbchina.com", + "~Haoran_Liao1" + ], + "rank": 1951 + }, + { + "url": "https://openreview.net/forum?id=0LlujmaN0R_", + "decision": "Reject", + "ratings": [ + 4, + 5, + 6, + 5 + ], + "rating": "4.85", + "confidences": [ + 4, + 4, + 2, + 3 + ], + "abstract": "We present a general framework for evolutionary learning to emergent unbiased state representation without any supervision. Evolutionary frameworks such as self-play converge to bad local optima in case of multi-agent reinforcement learning in non-cooperative partially observable environments with communication due to information asymmetry. Our proposed framework is a simple modification of self-play inspired by mechanism design, also known as {\\em reverse game theory}, to elicit truthful signals and make the agents cooperative. The key idea is to add imaginary rewards using the peer prediction method, i.e., a mechanism for evaluating the validity of information exchanged between agents in a decentralized environment. 
Numerical experiments with predator prey, traffic junction and StarCraft tasks demonstrate that the state-of-the-art performance of our framework.\n", + "title": "Truthful Self-Play", + "authors": [ + "Shohei Ohsawa" + ], + "emails": [ + "~Shohei_Ohsawa1" + ], + "rank": 1952 + }, + { + "url": "https://openreview.net/forum?id=ccwT339SIu", + "decision": "Reject", + "ratings": [ + 5, + 4, + 6 + ], + "rating": "4.85", + "confidences": [ + 5, + 5, + 3 + ], + "abstract": "Existing methods for video generation struggle to generate more than a short sequence of frames. We introduce a non-parametric approach for infinite video generation based on learning to resample frames from an input video. Our work is inspired by Video Textures, a classic method relying on pixel similarity to stitch sequences of frames, which performs well for videos with a high degree of regularity but fails in less constrained settings. Our method learns a distance metric to compare frames in a manner that scales to more challenging dynamics and allows for conditioning on heterogeneous data, such as audio. We learn representations for video frames and probabilities of transitioning by fitting a video-specific bi-gram model trained using contrastive learning. To synthesize the texture, we represent the video as a graph where the nodes are frames and edges are transitions with probabilities predicted by our video-specific model. By randomly traversing edges with high transition probabilities, we generate diverse temporally smooth videos with novel sequences and transitions. The model naturally extends with no additional training to handle the task of Audio Conditioned Video Synthesis, when conditioned on an audio signal. 
Our model outperforms baselines on human perceptual scores, can handle a diverse range of input videos, and can combine semantic and audio-visual cues in order to synthesize videos that synchronize well with an audio signal.", + "title": "Contrastive Video Textures", + "authors": [ + "Medhini Narasimhan", + "Shiry Ginosar", + "Andrew Owens", + "Alexei A Efros", + "Trevor Darrell" + ], + "emails": [ + "~Alexei_A_Efros1", + "~Medhini_Narasimhan1", + "~Trevor_Darrell2", + "~Andrew_Owens1", + "~Shiry_Ginosar1" + ], + "rank": 1953 + }, + { + "url": "https://openreview.net/forum?id=e60-SyRXtRt", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 4 + ], + "rating": "4.85", + "confidences": [ + 4, + 3, + 4, + 2 + ], + "abstract": "Attribution methods have been shown as promising approaches for identifying key features that led to learned model predictions. While most existing attribution methods rely on a baseline input for performing feature perturbations, limited research has been conducted to address the baseline selection issues. Poor choices of baselines can lead to unfair attributions as well as limited ability of one-vs-one explanations for multi-class classifiers, which means explaining why the input belongs to its original class but not the other specified target class. Achieving one-vs-one explanation is crucial when certain classes are more similar than others, e.g. two bird types among multiple animals. One-vs-one explanations focus on key differentiating features rather than features shared across the original and the target classes. In this paper, we present GANMEX, a novel algorithm applying Generative Adversarial Networks (GAN) by incorporating the to-be-explained classifier as part of the adversarial networks. Our approach effectively selects the baseline as the closest realistic sample belong to the target class, which allows attribution methods to provide true one-vs-one explanations. 
We showed that GANMEX baselines improved the saliency maps visually and led to stronger performance on perturbation-based evaluation metrics over the existing baselines. Attribution results with the existing baselines are known to be insensitive to model randomization, and we demonstrated that GANMEX baselines led to better outcome under the randomization sanity checks.", + "title": "GANMEX: Class-Targeted One-vs-One Attributions using GAN-based Model Explainability", + "authors": [ + "Sheng-Min Shih", + "Pin-Ju Tien", + "Zohar Karnin" + ], + "emails": [ + "gmail.com", + "~Sheng-Min_Shih1", + "~Zohar_Karnin1" + ], + "rank": 1954 + }, + { + "url": "https://openreview.net/forum?id=NZj7TnMr01", + "decision": "Reject", + "ratings": [ + 6, + 3, + 5, + 6 + ], + "rating": "4.85", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "Neural networks have proven successful at learning from complex data distributions by acting as universal function approximators. However, neural networks are often overconfident in their predictions, which leads to inaccurate and miscalibrated probabilistic predictions. The problem of overconfidence becomes especially apparent in cases where the test-time data distribution differs from that which was seen during training. We propose a solution to this problem by seeking out regions in arbitrary feature space where the model is unjustifiably overconfident, and conditionally raising the entropy of those predictions towards that of the Bayesian prior on the distribution of the labels. Our method results in a better calibrated network and is agnostic to the underlying model structure, so it can be applied to any neural network which produces a probability density as an output. 
We demonstrate the effectiveness of our method and validate its performance on both classification and regression problems by applying it to the training of recent state-of-the-art neural network models.", + "title": "Improving Neural Network Accuracy and Calibration Under Distributional Shift with Prior Augmented Data ", + "authors": [ + "Jeffrey Ryan Willette", + "Juho Lee", + "Sung Ju Hwang" + ], + "emails": [ + "~Juho_Lee2", + "~Jeffrey_Ryan_Willette1", + "~Sung_Ju_Hwang1" + ], + "rank": 1955 + }, + { + "url": "https://openreview.net/forum?id=D5Wt3FtvCF", + "decision": "Concerns raised (can publish with adjustment)", + "ratings": [ + 6, + 4, + 4, + 5 + ], + "rating": "4.85", + "confidences": [ + 3, + 1, + 4, + 5 + ], + "abstract": "Commercial recommendation can be regarded as an interactive process between the recommendation platform and its target users. One crucial problem for the platform is how to make full use of its advantages so as to maximize its utility, i.e., the commercial benefits from recommendation. In this paper, we propose a novel recommendation framework which effectively utilizes the information of user uncertainty over different item dimensions and explicitly takes into consideration the impact of display policy on user in order to achieve maximal expected posterior utility for the platform. We formulate the problem of deriving optimal policy to achieve maximal expected posterior utility as a constrained non-convex optimization problem and further propose an ADMM-based solution to derive an approximately optimal policy. Extensive experiments are conducted over data collected from a real-world recommendation platform and demonstrate the effectiveness of the proposed framework. Besides, we also adopt the proposed framework to conduct experiments with an intent to reveal how the platform achieves its commercial benefits. 
The results suggest that the platform should cater to the user's preference for item dimensions that the user prefers, while for item dimensions where the user is with high uncertainty, the platform can achieve more commercial benefits by recommending items with high utilities.", + "title": "PURE: An Uncertainty-aware Recommendation Framework for Maximizing Expected Posterior Utility of Platform", + "authors": [ + "Haokun Chen", + "Zhaoyang Liu", + "Chen Xu", + "Ziqian Chen", + "Jinyang Gao", + "Bolin Ding" + ], + "emails": [ + "~Haokun_Chen1", + "~Chen_Xu2", + "~Ziqian_Chen1", + "alibaba-inc.com", + "~Bolin_Ding3" + ], + "rank": 1956 + }, + { + "url": "https://openreview.net/forum?id=bVzUDC_4ls", + "decision": "Reject", + "ratings": [ + 4, + 4, + 8, + 3 + ], + "rating": "4.83", + "confidences": [ + 3, + 5, + 5, + 5 + ], + "abstract": "Motivated by the need to reliably characterize the robustness of deep neural networks, researchers have developed verification algorithms for deep neural networks. Given a neural network, the verifiers aim to answer whether certain properties are guaranteed with respect to all inputs in a space. However, little attention has been paid to floating point numerical error in neural network verification.\n\nWe exploit floating point errors in the inference and verification implementations to construct adversarial examples for neural networks that a verifier claims to be robust with respect to certain inputs. 
We argue that, to produce sound verification results, any verification system must accurately (or conservatively) model the effects of any float point computations in the network inference or verification system.", + "title": "Exploiting Verified Neural Networks via Floating Point Numerical Error", + "authors": [ + "Kai Jia", + "Martin Rinard" + ], + "emails": [ + "~Martin_Rinard1", + "~Kai_Jia2" + ], + "rank": 1957 + }, + { + "url": "https://openreview.net/forum?id=A-Sp6CR9-AA", + "decision": "Reject", + "ratings": [ + 5, + 6, + 5, + 3 + ], + "rating": "4.83", + "confidences": [ + 5, + 5, + 4, + 4 + ], + "abstract": "We present Sandwich Batch Normalization (SaBN), a frustratingly easy improvement of Batch Normalization (BN) with only a few lines of code changes. SaBN is motivated by addressing the inherent feature distribution heterogeneity that one can be identified in many tasks, which can arise from model heterogeneity (dynamic architectures, model conditioning, etc.), or data heterogeneity (multiple input domains). A SaBN factorizes the BN affine layer into one shared sandwich affine layer, cascaded by several parallel independent affine layers. Its variants include further decomposing the normalization layer into multiple parallel ones, and extending similar ideas to instance normalization. We demonstrate the prevailing effectiveness of SaBN (as well as its variants) as a drop-in replacement in four tasks: neural architecture search (NAS), image generation, adversarial training, and style transfer. Leveraging SaBN immediately boosts two state-of-the-art weight-sharing NAS algorithms significantly on NAS-Bench-201; achieves better Inception Score and FID on CIFAR-10 and ImageNet conditional image generation with three state-of-the art GANs; substantially improves the robust and standard accuracy for adversarial defense; and produces superior arbitrary stylized results. We also provide visualizations and analysis to help understand why SaBN works. 
All our codes and pre-trained models will be released upon acceptance. ", + "title": "Sandwich Batch Normalization", + "authors": [ + "Xinyu Gong", + "Wuyang Chen", + "Tianlong Chen", + "Zhangyang Wang" + ], + "emails": [ + "~Zhangyang_Wang1", + "~Tianlong_Chen1", + "~Xinyu_Gong1", + "~Wuyang_Chen1" + ], + "rank": 1958 + }, + { + "url": "https://openreview.net/forum?id=dmCL033_YwO", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 6 + ], + "rating": "4.83", + "confidences": [ + 5, + 5, + 3, + 5 + ], + "abstract": "Graph Convolutional Networks (GCNs) have been drawing significant attention with the power of representation learning on graphs. Recent works developed frameworks to train deep GCNs. Such works show impressive results in tasks like point cloud classification and segmentation, and protein interaction prediction. In this work, we study the performance of such deep models in large scale graph datasets from the Open Graph Benchmark (OGB). In particular, we look at the effect of adequately choosing an aggregation function, and its effect on final performance. Common choices of aggregation are mean, max, and sum. It has shown that GCNs are sensitive to such aggregations when applied to different datasets. We further validate this point and propose to alleviate it by introducing a novel Generalized Aggregation Function. Our new aggregation not only covers all commonly used ones, but also can be tuned to learn customized functions for different tasks. Our generalized aggregation is fully differentiable, and thus its parameters can be learned in an end-to-end fashion. 
We add our generalized aggregation into a deep GCN framework and show it achieves state-of-the-art results in six benchmarks from OGB.", + "title": "DeeperGCN: Training Deeper GCNs with Generalized Aggregation Functions", + "authors": [ + "Guohao Li", + "Chenxin Xiong", + "Ali Thabet", + "Bernard Ghanem" + ], + "emails": [ + "~Bernard_Ghanem1", + "kaust.edu.sa", + "~Ali_Thabet1", + "~Guohao_Li1" + ], + "rank": 1959 + }, + { + "url": "https://openreview.net/forum?id=aFvG-DNPNB9", + "decision": "Reject", + "ratings": [ + 5, + 3, + 7 + ], + "rating": "4.83", + "confidences": [ + 5, + 4, + 3 + ], + "abstract": "The Variational Autoencoder (VAE) is a powerful framework for learning probabilistic latent variable generative models. However, typical assumptions on the approximate posterior distributions can substantially restrict its capacity for inference and generative modeling. Variational inference based on neural autoregressive models respects the conditional dependencies of the exact posterior, but this flexibility comes at a cost: the resulting models are expensive to train in high-dimensional regimes and can be slow to produce samples. In this work, we introduce an orthogonal solution, which we call self-reflective inference. By redesigning the hierarchical structure of existing VAE architectures, self-reflection ensures that the stochastic flow preserves the factorization of the exact posterior, sequentially updating the latent codes in a manner consistent with the generative model. We empirically demonstrate the advantages of matching the variational posterior to the exact posterior---on binarized MNIST self-reflective inference achieves state-of-the-art performance without resorting to complex, computationally expensive components such as autoregressive layers. Moreover, we design a variational normalizing flow that employs the proposed architecture, yielding predictive benefits compared to its purely generative counterpart. 
Our proposed modification is quite general and it complements the existing literature; self-reflective inference can naturally leverage advances in distribution estimation and generative modeling to improve the capacity of each layer in the hierarchy.", + "title": "Self-Reflective Variational Autoencoder", + "authors": [ + "Ifigeneia Apostolopoulou", + "Elan Rosenfeld", + "Artur Dubrawski" + ], + "emails": [ + "~Elan_Rosenfeld1", + "~Artur_Dubrawski2", + "~Ifigeneia_Apostolopoulou1" + ], + "rank": 1960 + }, + { + "url": "https://openreview.net/forum?id=j0uePNuoBho", + "decision": "Reject", + "ratings": [ + 4, + 6, + 4, + 6 + ], + "rating": "4.82", + "confidences": [ + 5, + 4, + 5, + 3 + ], + "abstract": "This paper presents a novel differentiable method for unstructured weight pruning of deep neural networks. Our learned-threshold pruning (LTP) method learns per-layer thresholds via gradient descent, unlike conventional methods where they are set as input. Making thresholds trainable also makes LTP computationally efficient, hence scalable to deeper networks. For example, it takes 30 epochs for LTP to prune ResNet50 on ImageNet by a factor of 9.1. This is in contrast to other methods that search for per-layer thresholds via a computationally intensive iterative pruning and fine-tuning process. Additionally, with a novel differentiable L0 regularization, LTP is able to operate effectively on architectures with batch-normalization. This is important since L1 and L2 penalties lose their regularizing effect in networks with batch-normalization. Finally, LTP generates a trail of progressively sparser networks from which the desired pruned network can be picked based on sparsity and performance requirements. These features allow LTP to achieve competitive compression rates on ImageNet networks such as AlexNet (26.4\u00d7 compression with 79.1% Top-5 accuracy) and ResNet50 (9.1\u00d7 compression with 92.0% Top-5 accuracy). 
We also show that LTP effectively prunes modern \\textit{compact} architectures, such as EfficientNet, MobileNetV2 and MixNet.", + "title": "Learned Threshold Pruning", + "authors": [ + "Kambiz Azarian", + "Yash Sanjay Bhalgat", + "Jinwon Lee", + "Tijmen Blankevoort" + ], + "emails": [ + "~Jinwon_Lee1", + "~Yash_Sanjay_Bhalgat1", + "~Tijmen_Blankevoort1", + "~Kambiz_Azarian1" + ], + "rank": 1961 + }, + { + "url": "https://openreview.net/forum?id=bzVsk7bnGdh", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5, + 5 + ], + "rating": "4.82", + "confidences": [ + 4, + 3, + 5, + 5 + ], + "abstract": "Neural differential equations may be trained by backpropagating gradients via the adjoint method, which is another differential equation typically solved using an adaptive-step-size numerical differential equation solver. A proposed step is accepted if its error, \\emph{relative to some norm}, is sufficiently small; else it is rejected, the step is shrunk, and the process is repeated. Here, we demonstrate that the particular structure of the adjoint equations makes the usual choices of norm (such as L2) unnecessarily stringent. By replacing it with a more appropriate (semi)norm, fewer steps are unnecessarily rejected and the backpropagation is made faster. This requires only minor code modifications. Experiments on a wide range of tasks---including time series, generative modeling, and physical control---demonstrate a median improvement of 40% fewer function evaluations. On some problems we see as much as 62% fewer function evaluations, so that the overall training time is roughly halved.", + "title": "\"Hey, that's not an ODE'\": Faster ODE Adjoints with 12 Lines of Code", + "authors": [ + "Patrick Kidger", + "Ricky T. Q. 
Chen", + "Terry Lyons" + ], + "emails": [ + "~Ricky_T._Q._Chen1", + "maths.ox.ac.uk", + "~Patrick_Kidger1" + ], + "rank": 1962 + }, + { + "url": "https://openreview.net/forum?id=5ZFeGYBBPgs", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5 + ], + "rating": "4.82", + "confidences": [ + 3, + 5, + 3 + ], + "abstract": "Quantification of uncertainty is one of the most promising approaches to establish safe machine learning. Despite its importance, it is far from being generally solved, especially for neural networks. One of the most commonly used approaches so far is Monte Carlo dropout, which is computationally cheap and easy to apply in practice. However, it can underestimate the uncertainty. We propose a new objective, referred to as second-moment loss (SML), to address this issue. While the full network is encouraged to model the mean, the dropout networks are explicitly used to optimize the model variance. We analyze the performance of the new objective on various toy and UCI regression datasets. Comparing to the state-of-the-art of deep ensembles, SML leads to comparable prediction accuracies and uncertainty estimates while only requiring a single model. Under distribution shift, we observe moderate improvements. From a safety perspective also the study of worst-case uncertainties is crucial. In this regard we improve considerably. Finally, we show that SML can be successfully applied to SqueezeDet, a modern object detection network. We improve on its uncertainty-related scores while not deteriorating regression quality. 
As a side result, we introduce an intuitive Wasserstein distance-based uncertainty measure that is non-saturating and thus allows to resolve quality differences between any two uncertainty estimates.", + "title": "Second-Moment Loss: A Novel Regression Objective for Improved Uncertainties", + "authors": [ + "Joachim Sicking", + "Maram Akila", + "Maximilian Alexander Pintz", + "Tim Wirtz", + "Asja Fischer", + "Stefan Wrobel" + ], + "emails": [ + "~Stefan_Wrobel1", + "~Asja_Fischer1", + "~Tim_Wirtz1", + "iais.fraunhofer.de", + "~Joachim_Sicking1" + ], + "rank": 1963 + }, + { + "url": "https://openreview.net/forum?id=bM4Iqfg8M2k", + "decision": "Accept (Poster)", + "ratings": [ + 2, + 8, + 3, + 7 + ], + "rating": "4.82", + "confidences": [ + 2, + 2, + 4, + 3 + ], + "abstract": "Given the input graph and its label/property, several key problems of graph learning, such as finding interpretable subgraphs, graph denoising and graph compression, can be attributed to the fundamental problem of recognizing a subgraph of the original one. This subgraph shall be as informative as possible, yet contains less redundant and noisy structure. This problem setting is closely related to the well-known information bottleneck (IB) principle, which, however, has less been studied for the irregular graph data and graph neural networks (GNNs). In this paper, we propose a framework of Graph Information Bottleneck (GIB) for the subgraph recognition problem in deep graph learning. Under this framework, one can recognize the maximally informative yet compressive subgraph, named IB-subgraph. However, the GIB objective is notoriously hard to optimize, mostly due to the intractability of the mutual information of irregular graph data and the unstable optimization process. 
In order to tackle these challenges, we propose: i) a GIB objective based-on a mutual information estimator for the irregular graph data; ii) a bi-level optimization scheme to maximize the GIB objective; iii) a connectivity loss to stabilize the optimization process. We evaluate the properties of the IB-subgraph in three application scenarios: improvement of graph classification, graph interpretation and graph denoising. Extensive experiments demonstrate that the information-theoretic IB-subgraph enjoys superior graph properties. ", + "title": "Graph Information Bottleneck for Subgraph Recognition", + "authors": [ + "Junchi Yu", + "Tingyang Xu", + "Yu Rong", + "Yatao Bian", + "Junzhou Huang", + "Ran He" + ], + "emails": [ + "~Junchi_Yu1", + "~Yatao_Bian1", + "~Yu_Rong1", + "~Tingyang_Xu1", + "~Junzhou_Huang2", + "~Ran_He1" + ], + "rank": 1964 + }, + { + "url": "https://openreview.net/forum?id=zabJK7XTb-A", + "ratings": [ + 5, + 5, + 5, + 4 + ], + "rating": "4.82", + "confidences": [ + 5, + 5, + 4, + 3 + ], + "abstract": "Transductive inference is an effective means of tackling the data deficiency problem in few-shot learning settings. A popular transductive inference technique for few-shot metric-based approaches, is to update the prototype of each class with the mean of the most confident query examples, or confidence-weighted average of all the query samples. However, a caveat here is that the model confidence may be unreliable, which may lead to incorrect predictions. To tackle this issue, we propose to meta-learn the confidence for each query sample, to assign optimal weights to unlabeled queries such that they improve the model\u2019s transductive inference performance on unseen tasks. We achieve this by meta-learning an input-adaptive distance metric over a task distribution under various model and data perturbations, which will enforce consistency on the model predictions under diverse uncertainties for unseen tasks. 
We validate our few-shot learning model with meta-learned confidence on four benchmark datasets, on which it largely outperforms strong recent baselines and obtains new state-of-the-art results. Further application on semi-supervised few shot learning tasks also yields significant performance improvements over the baselines.", + "title": "Meta-Learned Confidence for Transductive Few-shot Learning", + "authors": [ + "Seong Min Kye", + "Hae Beom Lee", + "Hoirin Kim", + "Sung Ju Hwang" + ], + "emails": [ + "~Seong_Min_Kye1", + "~Sung_Ju_Hwang1", + "kaist.ac.kr", + "~Hae_Beom_Lee1" + ], + "rank": 1965 + }, + { + "url": "https://openreview.net/forum?id=IohHac70h3R", + "decision": "Reject", + "ratings": [ + 3, + 5, + 4, + 5, + 8 + ], + "rating": "4.82", + "confidences": [ + 4, + 3, + 4, + 3, + 3 + ], + "abstract": "Numerous adaptive algorithms such as AMSGrad and Radam have been proposed and applied to deep learning recently. However, these modifications do not improve the convergence rate of adaptive algorithms and whether a better algorithm exists still remains an open question. In this work, we propose a new motivation for designing the proximal function of adaptive algorithms, named as marginal regret bound minimization. Based on such an idea, we propose a new class of adaptive algorithms that not only achieves marginal optimality but can also potentially converge much faster than any existing adaptive algorithms in the long term. We show the superiority of the new class of adaptive algorithms both theoretically and empirically using experiments in deep learning. 
", + "title": "On the Marginal Regret Bound Minimization of Adaptive Methods", + "authors": [ + "Wenjie Li", + "Guang Cheng" + ], + "emails": [ + "~Wenjie_Li2", + "~Guang_Cheng1" + ], + "rank": 1966 + }, + { + "url": "https://openreview.net/forum?id=MRQJmsNPp8E", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4 + ], + "rating": "4.82", + "confidences": [ + 3, + 3, + 5 + ], + "abstract": "Learning visual representations using large-scale unlabelled images is a holy grail for most of computer vision tasks. Recent contrastive learning methods have focused on encouraging the learned visual representations to be linearly separable among the individual items regardless of their semantic similarity; however, it could lead to a sub-optimal solution if a given downstream task is related to non-discriminative ones such as cluster analysis and information retrieval. In this work, we propose an advanced approach to consider the instance semantics in an unsupervised environment by both i) Contrasting batch-wise Cluster assignment features and ii) Bootstrapping an INstance representations without considering negatives simultaneously, referred to as C2BIN. Specifically, instances in a mini-batch are appropriately assigned to distinct clusters, each of which aims to capture apparent similarity among instances. Moreover, we introduce a pyramidal multi-heads technique, showing positive effects on the representations by capturing multi-scale semantics. 
Empirically, our method achieves comparable or better performance than both representation learning and clustering baselines on various benchmark datasets: CIFAR-10, CIFAR-100, and STL-10.", + "title": "Learning Representations by Contrasting Clusters While Bootstrapping Instances", + "authors": [ + "Junsoo Lee", + "Hojoon Lee", + "Inkyu Shin", + "Jaekyoung Bae", + "In So Kweon", + "Jaegul Choo" + ], + "emails": [ + "kakaoenterprise.com", + "kaist.ac.kr", + "~Inkyu_Shin1", + "~Junsoo_Lee1", + "~Jaegul_Choo1", + "~In_So_Kweon2" + ], + "rank": 1967 + }, + { + "url": "https://openreview.net/forum?id=N5Zacze7uru", + "decision": "Reject", + "ratings": [ + 5, + 3, + 7 + ], + "rating": "4.82", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "With a growing interest in data-driven control techniques, Model Predictive Control (MPC) provides a significant opportunity to exploit the surplus of data reliably, particularly while taking safety and stability into account. In this paper, we aim to infer the terminal cost of an MPC controller from transitions generated by an initial \\emph{unknown} demonstrator. We propose an algorithm to alternatively learn the terminal cost and update the MPC parameters according to a stability metric. We design the terminal cost as a Lyapunov function neural network and theoretically show that, under limited approximation error, our proposed approach guarantees that the size of the stability region (region of attraction) is greater than or equal to the one from the initial demonstrator. We also present theorems that characterize the stability and performance of the learned MPC in the presence of model uncertainties and sub-optimality due to function approximation. Empirically, we demonstrate the efficacy of the proposed algorithm on non-linear continuous control tasks with soft constraints. 
Our results show that the proposed approach can improve upon the initial demonstrator also in practice and achieve better task performance than other learning-based baselines. ", + "title": "Neural Lyapunov Model Predictive Control", + "authors": [ + "Mayank Mittal", + "Marco Gallieri", + "Alessio Quaglino", + "Seyed Sina Mirrazavi Salehian", + "Jan Koutnik" + ], + "emails": [ + "nnaisense.com", + "~Jan_Koutnik1", + "~Marco_Gallieri1", + "~Alessio_Quaglino1", + "~Mayank_Mittal1" + ], + "rank": 1968 + }, + { + "url": "https://openreview.net/forum?id=ok4MWWSeOJ1", + "decision": "Reject", + "ratings": [ + 6, + 5, + 3, + 5 + ], + "rating": "4.81", + "confidences": [ + 5, + 4, + 4, + 3 + ], + "abstract": "Continual Learning addresses the challenge of learning a number of different distributions sequentially. The goal of maintaining knowledge of earlier distributions without re-accessing them starkly conflicts with standard SGD training for artificial neural networks. An influential method to tackle this are so-called regularisation approaches. They measure the importance of each parameter for modelling a given distribution and subsequently protect important parameters from large changes. In the literature, three ways to measure parameter importance have been put forward and they have inspired a large body of follow-up work. Here, we present strong theoretical and empirical evidence that these three methods, Elastic Weight Consolidation (EWC), Synaptic Intelligence (SI) and Memory Aware Synapses (MAS), all approximate the Fisher Information. Only EWC intentionally relies on the Fisher, while the other two methods stem from rather different motivations. We find that for SI the relation to the Fisher -- and in fact its performance -- is due to a previously unknown bias. Altogether, this unifies a large body of regularisation approaches. 
It also provides the first theoretical explanation for the effectiveness of SI- and MAS-based algorithms and offers theoretically justified versions of these algorithms. From a practical viewpoint, our insights offer computational speed-ups and uncover conditions needed for different algorithms to work.", + "title": "Unifying Regularisation Methods for Continual Learning", + "authors": [ + "Frederik Benzing" + ], + "emails": [ + "~Frederik_Benzing1" + ], + "rank": 1969 + }, + { + "url": "https://openreview.net/forum?id=zleOqnAUZzl", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 4 + ], + "rating": "4.81", + "confidences": [ + 4, + 5, + 2, + 5 + ], + "abstract": "Deep neural networks (DNNs) are known to produce incorrect predictions with very high confidence on out-of-distribution (OOD) inputs. This limitation is one of the key challenges in the adoption of deep learning models in high-assurance systems such as autonomous driving, air traffic management, and medical diagnosis. This challenge has received significant attention recently, and several techniques have been developed to detect inputs where the model's prediction cannot be trusted. These techniques use different statistical, geometric, or topological signatures. This paper presents a taxonomy of OOD outlier inputs based on their source and nature of uncertainty. We demonstrate how different existing detection approaches fail to detect certain types of outliers. We utilize these insights to develop a novel integrated detection approach that uses multiple attributes corresponding to different types of outliers. Our results include experiments on CIFAR10, SVHN and MNIST as in-distribution data and Imagenet, LSUN, SVHN (for CIFAR10), CIFAR10 (for SVHN), KMNIST, and F-MNIST as OOD data across different DNN architectures such as ResNet34, WideResNet, DenseNet, and LeNet5.", + "title": "Are all outliers alike? 
On Understanding the Diversity of Outliers for Detecting OODs", + "authors": [ + "Ramneet Kaur", + "Susmit Jha", + "Anirban Roy" + ], + "emails": [ + "~Anirban_Roy3", + "~Susmit_Jha1", + "seas.upenn.edu" + ], + "rank": 1970 + }, + { + "url": "https://openreview.net/forum?id=uie1cYdC2B", + "decision": "Reject", + "ratings": [ + 4, + 5, + 7, + 4 + ], + "rating": "4.81", + "confidences": [ + 5, + 4, + 3, + 4 + ], + "abstract": "Adversarial learning strategy has demonstrated remarkable performance in dealing with single-source unsupervised Domain Adaptation (DA) problems, and it has recently been applied to multi-source DA problems. While most of the existing DA methods use multiple domain discriminators, the effect of using multiple discriminators on the quality of latent space representations has been poorly understood. Here we provide theoretical insights into potential pitfalls of using multiple domain discriminators: First, domain-discriminative information is inevitably distributed across multiple discriminators. Second, it is not scalable in terms of computational resources. Third, the variance of stochastic gradients from multiple discriminators may increase, which significantly undermines training stability. To fully address these issues, we situate adversarial DA in the context of information regularization. First, we present a unified information regularization framework for multi-source DA. It provides a theoretical justification for using a single and unified domain discriminator to encourage the synergistic integration of the information gleaned from each domain. Second, this motivates us to implement a novel neural architecture called a Multi-source Information-regularized Adaptation Networks (MIAN). The proposed model significantly reduces the variance of stochastic gradients and increases computational-efficiency. 
Large-scale simulations on various multi-source DA scenarios demonstrate that MIAN, despite its structural simplicity, reliably outperforms other state-of-the-art methods by a large margin especially for difficult target domains.", + "title": "A Simple Unified Information Regularization Framework for Multi-Source Domain Adaptation", + "authors": [ + "Geon Yeong Park", + "Sang wan Lee" + ], + "emails": [ + "~Geon_Yeong_Park1", + "kaist.ac.kr" + ], + "rank": 1971 + }, + { + "url": "https://openreview.net/forum?id=ykCRDlfxmk", + "ratings": [ + 4, + 6, + 5, + 5 + ], + "rating": "4.81", + "confidences": [ + 5, + 2, + 4, + 5 + ], + "abstract": "Deep learning models often require extensive efforts in optimizing hyperparameters and architectures. Standard hyperparameter optimization methods are expensive because of their multi-trial nature: different configurations are tried separately to find the best. In this paper, we propose AutoHAS, an efficient framework for both hyperparameter and architecture search. AutoHAS generalizes the concept of efficient architecture search, ENAS and DARTS, to hyperparameter search and hence can jointly optimize both in a single training. A key challenge in such generalization is that ENAS and DARTS are designed to optimize discrete architecture choices, whereas hyperparameter choices are often continuous. To tackle this challenge, we discretize the continuous space into a linear combination of multiple categorical basis. Furthermore, we extend the idea of weight sharing and augment it with REINFORCE to reduce its memory cost. In order to decouple the shared network weights and controller optimization, we also propose to create temporary weights for evaluating the sampled hyperparameters and updating the controller. Experimental results show AutoHAS can improve the ImageNet accuracy by up to 0.8% for highly-optimized state-of-the-art ResNet/EfficientNet models, and up to 11% for less-optimized models. 
Compared to random search and Bayesian search, AutoHAS consistently achieves better accuracy with 10x less computation cost.", + "title": "AutoHAS: Efficient Hyperparameter and Architecture Search", + "authors": [ + "Xuanyi Dong", + "Mingxing Tan", + "Adams Wei Yu", + "Daiyi Peng", + "Bogdan Gabrys", + "Quoc V Le" + ], + "emails": [ + "~Daiyi_Peng1", + "~Quoc_V_Le1", + "~Mingxing_Tan3", + "~Bogdan_Gabrys1", + "~Xuanyi_Dong1", + "~Adams_Wei_Yu1" + ], + "rank": 1972 + }, + { + "url": "https://openreview.net/forum?id=pHgB1ASMgMW", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 4 + ], + "rating": "4.80", + "confidences": [ + 4, + 4, + 2, + 5 + ], + "abstract": "Deep neural networks (DNNs) are known to be prone to adversarial attacks, for which many remedies are proposed. While adversarial training (AT) is regarded as the most robust defense, it suffers from poor performance both on clean examples and under other types of attacks, e.g. attacks with larger perturbations. Meanwhile, regularizers that encourage uncertain outputs, such as entropy maximization (EntM) and label smoothing (LS) can maintain accuracy on clean examples and improve performance under weak attacks, yet their ability to defend against strong attacks is still in doubt. In this paper, we revisit uncertainty promotion regularizers, including EntM and LS, in the field of adversarial learning. We show that EntM and LS alone provide robustness only under small perturbations. Contrarily, we show that uncertainty promotion regularizers complement AT in a principled manner, consistently improving performance on both clean examples and under various attacks, especially attacks with large perturbations. We further analyze how uncertainty promotion regularizers enhance the performance of AT from the perspective of Jacobian matrices \u2207Xf(X;\u03b8), and find out that EntM effectively shrinks the norm of Jacobian matrices and hence promotes robustness. 
", + "title": "Rethinking Uncertainty in Deep Learning: Whether and How it Improves Robustness", + "authors": [ + "Yilun Jin", + "Lixin Fan", + "Kam Woh Ng", + "Ce Ju", + "Qiang Yang" + ], + "emails": [ + "webank.com", + "~Ce_Ju1", + "~Qiang_Yang1", + "~Lixin_Fan1", + "~Yilun_Jin1" + ], + "rank": 1973 + }, + { + "url": "https://openreview.net/forum?id=qHXkE-8c1sQ", + "decision": "Reject", + "ratings": [ + 6, + 4, + 4, + 5 + ], + "rating": "4.80", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "We provide a practical distance measure in the space of functions parameterized by neural networks. It is based on the classical information distance, and we propose to replace the uncomputable Kolmogorov complexity with information measured by codelength of prequential coding. We also provide a method for directly estimating the expectation of such codelength with limited examples. Empirically, we show that information distance is invariant with respect to different parameterization of the neural networks. We also verify that information distance can faithfully reflect similarities of neural network functions. Finally, we applied information distance to investigate the relationship between neural network models, and demonstrate the connection between information distance and multiple characteristics and behaviors of neural networks.", + "title": "Information distance for neural network functions", + "authors": [ + "Xiao Zhang", + "Dejing Dou", + "Ji Wu" + ], + "emails": [ + "~Dejing_Dou1", + "mail.tsinghua.edu.cn", + "~Xiao_Zhang9" + ], + "rank": 1974 + }, + { + "url": "https://openreview.net/forum?id=oY7La6DBTLx", + "decision": "Reject", + "ratings": [ + 4, + 5, + 6, + 4 + ], + "rating": "4.80", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Recent studies on one-class classification have achieved a remarkable performance, by employing the self-supervised classifier that predicts the geometric transformation applied to in-class images. 
However, they cannot identify in-class images at all when the input images are geometrically-transformed (e.g., rotated images), because their classification-based in-class scores assume that input images always have a fixed viewpoint, as similar to the images used for training. Pointing out that humans can easily recognize such transformed images as the same class, in this work, we aim to propose a one-class classifier robust to geometrically-transformed inputs, named as GROC. To this end, we introduce a conformity score which indicates how strongly an input image agrees with one of the predefined in-class transformations, then utilize the conformity score with our proposed agreement measures for one-class classification. Our extensive experiments demonstrate that GROC is able to accurately distinguish in-class images from out-of-class images regardless of whether the inputs are geometrically-transformed or not, whereas the existing methods fail.", + "title": "One-class Classification Robust to Geometric Transformation", + "authors": [ + "Hyunjun Ju", + "Dongha Lee", + "SeongKu Kang", + "Hwanjo Yu" + ], + "emails": [ + "~Dongha_Lee1", + "postech.ac.kr", + "~Hyunjun_Ju1", + "~Hwanjo_Yu1" + ], + "rank": 1975 + }, + { + "url": "https://openreview.net/forum?id=wl0Kr_jqM2a", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 4 + ], + "rating": "4.80", + "confidences": [ + 4, + 5, + 3, + 3 + ], + "abstract": "Most existing adversarial defenses only measure robustness to Lp adversarial attacks. Not only are adversaries unlikely to exclusively create small Lp perturbations, adversaries are unlikely to remain fixed. Adversaries adapt and evolve their attacks; hence adversarial defenses must be robust to a broad range of unforeseen attacks. We address this discrepancy between research and reality by proposing a new evaluation framework called ImageNet-UA. 
Our framework enables the research community to test ImageNet model robustness against attacks not encountered during training. To create ImageNet-UA's diverse attack suite, we introduce a total of four novel adversarial attacks. We also demonstrate that, in comparison to ImageNet-UA, prevailing L\u221e robustness assessments give a narrow account of adversarial robustness. By evaluating current defenses with ImageNet-UA, we find they provide little robustness to unforeseen attacks. We hope the greater variety and realism of ImageNet-UA enables development of more robust defenses which can generalize beyond attacks seen during training.", + "title": "Testing Robustness Against Unforeseen Adversaries", + "authors": [ + "Daniel Kang", + "Yi Sun", + "Dan Hendrycks", + "Tom B Brown", + "Jacob Steinhardt" + ], + "emails": [ + "~Jacob_Steinhardt1", + "~Tom_B_Brown1", + "~Dan_Hendrycks1", + "~Daniel_Kang1", + "~Yi_Sun3" + ], + "rank": 1976 + }, + { + "url": "https://openreview.net/forum?id=0rNLjXgchOC", + "decision": "Reject", + "ratings": [ + 4, + 4, + 7, + 4 + ], + "rating": "4.80", + "confidences": [ + 2, + 5, + 4, + 4 + ], + "abstract": "Hessian captures important properties of the deep neural network loss landscape. We observe that eigenvectors and eigenspaces of the layer-wise Hessian for neural network objective have several interesting structures -- top eigenspaces for different models have high overlap, and top eigenvectors form low rank matrices when they are reshaped into the same shape as the weight matrix of the corresponding layer. These structures, as well as the low rank structure of the Hessian observed in previous studies, can be explained by approximating the Hessian using Kronecker factorization. Our new understanding can also explain why some of these structures become weaker when the network is trained with batch normalization. 
Finally, we show that the Kronecker factorization can be combined with PAC-Bayes techniques to get better generalization bounds.", + "title": "Dissecting Hessian: Understanding Common Structure of Hessian in Neural Networks", + "authors": [ + "Yikai Wu", + "Xingyu Zhu", + "Chenwei Wu", + "Annie N. Wang", + "Rong Ge" + ], + "emails": [ + "~Chenwei_Wu1", + "~Annie_N._Wang1", + "~Xingyu_Zhu1", + "~Yikai_Wu1", + "~Rong_Ge1" + ], + "rank": 1977 + }, + { + "url": "https://openreview.net/forum?id=MBdafA3G9k", + "decision": "Reject", + "ratings": [ + 6, + 5, + 4, + 4 + ], + "rating": "4.80", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "It would be desirable for a reinforcement learning (RL) based agent to learn behaviour by merely watching a demonstration. However, defining rewards that facilitate this goal within the RL paradigm remains a challenge. Here we address this problem with Siamese networks, trained to compute distances between observed behaviours and an agent's behaviours. We use an RNN-based comparator model to learn such distances in space and time between motion clips while training an RL policy to minimize this distance. Through experimentation, we have also found that the inclusion of multi-task data and an additional image encoding loss helps enforce temporal consistency and improve policy learning. These two components appear to balance reward for matching a specific instance of a behaviour versus that behaviour in general. Furthermore, we focus here on a particularly challenging form of this problem where only a single demonstration is provided for a given task -- the one-shot learning setting. We demonstrate our approach on humanoid, dog and raptor agents in 2D and a 3D quadruped and humanoid. In these environments, we show that our method outperforms the state-of-the-art, GAIfO (i.e. 
GAIL without access to actions) and TCNs.", + "title": "Visual Imitation with Reinforcement Learning using Recurrent Siamese Networks", + "authors": [ + "Glen Berseth", + "Florian Golemo", + "Christopher Pal" + ], + "emails": [ + "~Christopher_Pal1", + "~Florian_Golemo1", + "~Glen_Berseth1" + ], + "rank": 1978 + }, + { + "url": "https://openreview.net/forum?id=xFYXLlpIyPQ", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 8 + ], + "rating": "4.80", + "confidences": [ + 4, + 5, + 3, + 3 + ], + "abstract": "Learning-to-learn---using optimization algorithms to learn a new optimizer---has successfully trained efficient optimizers in practice. This approach relies on meta-gradient descent on a meta-objective based on the trajectory that the optimizer generates. However, there were few theoretical guarantees on how to avoid meta-gradient explosion/vanishing problem, or how to train an optimizer with good generalization performance. In this paper, we study the learning-to-learn approach on a simple problem of tuning the step size for quadratic loss. Our results show that although there is a way to design the meta-objective so that the meta-gradient remains polynomially bounded, computing the meta-gradient directly using backpropagation leads to numerical issues that look similar to gradient explosion/vanishing problems. We also characterize when it is necessary to compute the meta-objective on a separate validation set instead of the original training set. 
Finally, we verify our results empirically and show that a similar phenomenon appears even for more complicated learned optimizers parametrized by neural networks.", + "title": "Guarantees for Tuning the Step Size using a Learning-to-Learn Approach", + "authors": [ + "Xiang Wang", + "Shuai Yuan", + "Chenwei Wu", + "Rong Ge" + ], + "emails": [ + "~Xiang_Wang1", + "cs.duke.edu", + "~Chenwei_Wu1", + "~Rong_Ge1" + ], + "rank": 1979 + }, + { + "url": "https://openreview.net/forum?id=fycxGdpCCmW", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 3 + ], + "rating": "4.80", + "confidences": [ + 4, + 3, + 3, + 5 + ], + "abstract": "Contrastive learning and supervised learning have both seen significant progress and success. However, thus far they have largely been treated as two separate objectives, brought together only by having a shared neural network. In this paper we show that through the perspective of hybrid discriminative-generative training of energy-based models we can make a direct connection between contrastive learning and supervised learning. Beyond presenting this unified view, we show our specific choice of approximation of the energy-based loss significantly improves energy-based models and contrastive learning based methods in confidence-calibration, out-of-distribution detection, adversarial robustness, generative modeling, and image classification tasks. In addition to significantly improved performance, our method also gets rid of SGLD training and does not suffer from training instability. Our evaluations also demonstrate that our method performs better than or on par with state-of-the-art hand-tailored methods in each task. 
", + "title": "Hybrid Discriminative-Generative Training via Contrastive Learning", + "authors": [ + "Hao Liu", + "Pieter Abbeel" + ], + "emails": [ + "~Pieter_Abbeel2", + "~Hao_Liu1" + ], + "rank": 1980 + }, + { + "url": "https://openreview.net/forum?id=rkQuFUmUOg3", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 4, + 5 + ], + "rating": "4.80", + "confidences": [ + 2, + 4, + 4 + ], + "abstract": "Despite the success of recent Neural Architecture Search (NAS) methods on various tasks which have shown to output networks that largely outperform human-designed networks, conventional NAS methods have mostly tackled the optimization of searching for the network architecture for a single task (dataset), which does not generalize well across multiple tasks (datasets). Moreover, since such task-specific methods search for a neural architecture from scratch for every given task, they incur a large computational cost, which is problematic when the time and monetary budget are limited. In this paper, we propose an efficient NAS framework that is trained once on a database consisting of datasets and pretrained networks and can rapidly search for a neural architecture for a novel dataset. The proposed MetaD2A (Meta Dataset-to-Architecture) model can stochastically generate graphs (architectures) from a given set (dataset) via a cross-modal latent space learned with amortized meta-learning. Moreover, we also propose a meta-performance predictor to estimate and select the best architecture without direct training on target datasets. The experimental results demonstrate that our model meta-learned on subsets of ImageNet-1K and architectures from NAS-Bench 201 search space successfully generalizes to multiple unseen datasets including CIFAR-10 and CIFAR-100, with an average search time of 33 GPU seconds. Even under MobileNetV3 search space, MetaD2A is 5.5K times faster than NSGANetV2, a transferable NAS method, with comparable performance. 
We believe that the MetaD2A proposes a new research direction for rapid NAS as well as ways to utilize the knowledge from rich databases of datasets and architectures accumulated over the past years. Code is available at https://github.com/HayeonLee/MetaD2A.", + "title": "Rapid Neural Architecture Search by Learning to Generate Graphs from Datasets", + "authors": [ + "Hayeon Lee", + "Eunyoung Hyung", + "Sung Ju Hwang" + ], + "emails": [ + "~Hayeon_Lee1", + "~Sung_Ju_Hwang1", + "~Eunyoung_Hyung2" + ], + "rank": 1981 + }, + { + "url": "https://openreview.net/forum?id=XvOH0v2hsph", + "decision": "Reject", + "ratings": [ + 6, + 5, + 3 + ], + "rating": "4.80", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "Reliable yet efficient evaluation of generalisation performance of a proposed architecture is crucial to the success of neural architecture search (NAS). Traditional approaches face a variety of limitations: training each architecture to completion is prohibitively expensive, early stopping estimates may correlate poorly with fully trained performance, and model-based estimators require large training sets. Instead, motivated by recent results linking training speed and generalisation with stochastic gradient descent, we propose to estimate the final test performance based on the sum of training losses. Our estimator is inspired by the marginal likelihood, which is used for Bayesian model selection. Our model-free estimator is simple, efficient, and cheap to implement, and does not require hyperparameter-tuning or surrogate training before deployment. 
We demonstrate empirically that our estimator consistently outperforms other baselines under various settings and can achieve a rank correlation of 0.95 with final test accuracy on the NAS-Bench201 dataset within 50 epochs.", + "title": "Revisiting the Train Loss: an Efficient Performance Estimator for Neural Architecture Search", + "authors": [ + "Binxin Ru", + "Clare Lyle", + "Lisa Schut", + "Mark van der Wilk", + "Yarin Gal" + ], + "emails": [ + "~Clare_Lyle1", + "~Lisa_Schut2", + "~Binxin_Ru1", + "~Yarin_Gal1", + "~Mark_van_der_Wilk1" + ], + "rank": 1982 + }, + { + "url": "https://openreview.net/forum?id=IbFcpYnwCvd", + "decision": "Reject", + "ratings": [ + 4, + 6, + 6, + 4 + ], + "rating": "4.80", + "confidences": [ + 4, + 4, + 2, + 5 + ], + "abstract": "Learning composable policies for environments with complex rules and tasks is a challenging problem. We introduce a hierarchical reinforcement learning framework called the Logical Options Framework (LOF) that learns policies that are satisfying, optimal, and composable. LOF efficiently learns policies that satisfy tasks by representing the task as an automaton and integrating it into learning and planning. We provide and prove conditions under which LOF will learn satisfying, optimal policies. And lastly, we show how LOF's learned policies can be composed to satisfy unseen tasks with only 10-50 retraining steps. 
We evaluate LOF on four tasks in discrete and continuous domains.", + "title": "The Logical Options Framework", + "authors": [ + "Brandon Araki", + "Xiao Li", + "Kiran Vodrahalli", + "Jonathan DeCastro", + "J Micah Fry", + "Daniela Rus" + ], + "emails": [ + "ll.mit.edu", + "~Brandon_Araki1", + "tri.global", + "~Xiao_Li1", + "~Kiran_Vodrahalli1", + "~Daniela_Rus1" + ], + "rank": 1983 + }, + { + "url": "https://openreview.net/forum?id=ADwLLmSda3", + "decision": "Reject", + "ratings": [ + 4, + 6, + 4 + ], + "rating": "4.80", + "confidences": [ + 4, + 4, + 2 + ], + "abstract": "There is a significant demand for topic modeling on large-scale data with complex multi-modal structure in applications such as multi-layer network analysis, temporal document classification, and video data analysis; frequently this multi-modal data has latent hierarchical structure. We propose a new hierarchical nonnegative CANDECOMP/PARAFAC (CP) decomposition (hierarchical NCPD) model and a training method, Neural NCPD, for performing hierarchical topic modeling on multi-modal tensor data. Neural NCPD utilizes a neural network architecture and backpropagation to mitigate error propagation through hierarchical NCPD. ", + "title": "Neural Nonnegative CP Decomposition for Hierarchical Tensor Analysis", + "authors": [ + "Joshua Vendrow", + "Jamie Haddock", + "Deanna Needell" + ], + "emails": [ + "~Jamie_Haddock1", + "~Deanna_Needell2", + "math.ucla.edu" + ], + "rank": 1984 + }, + { + "url": "https://openreview.net/forum?id=iF81jBISQDV", + "ratings": [ + 5, + 4, + 4, + 6 + ], + "rating": "4.80", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Adversarial training has been demonstrated to be useful for improving the robustness of deep neural networks (DNNs). However, the impacts of basic network components (e.g., ReLU, the widely used activation function for DNNs) to adversarial training effectiveness received less attention and has not been comprehensively investigated so far. 
To fill this gap, in this work, we argue that the spatially-shared and input-independent activating properties of the ReLU make the DNNs under both standard training and adversarial training less robust to white-box adversarial attacks. To address such challenges, we design a novel activation function, i.e., Sparta: Spatially Attentive and Adversarially Robust Activation, which enables DNNs to achieve higher robustness (i.e., lower error rate on adversarial examples) and accuracy (i.e., lower error rate on clean examples) than the DNNs based on the state-of-the-art activation functions. We further investigate the relationships between our Sparta and the state-of-the-art search-based activation function, i.e., Swish, and feature denoising method, providing insights about the advantages of our method. Moreover, comprehensive evaluations have demonstrated two important properties of our method: First, superior transferability across DNNs. Our adversarially trained Sparta function for one DNN (e.g., ResNet-18) can be fixed to train another adversarially robust DNN (e.g., ResNet-34), achieving higher robustness than the one using vanilla ReLU as activation. Second, superior transferability across datasets. The Sparta function trained on one dataset (e.g., CIFAR-10) can be employed to train adversarially robust DNNs on another dataset (e.g., SVHN) and helps achieve higher robustness than DNNs with vanilla ReLU as activation. These properties have highlighted the flexibility and versatility of Sparta. 
Accompanying code is also submitted.", + "title": "Sparta: Spatially Attentive and Adversarially Robust Activations", + "authors": [ + "Qing Guo", + "Felix Juefei-Xu", + "Changqing Zhou", + "Lei Ma", + "Xiaofei Xie", + "Wei Feng", + "Yang Liu" + ], + "emails": [ + "~Felix_Juefei-Xu2", + "~Xiaofei_Xie1", + "~Lei_Ma1", + "~Wei_Feng1", + "e.ntu.edu.sg", + "~Qing_Guo3", + "~Yang_Liu36" + ], + "rank": 1985 + }, + { + "url": "https://openreview.net/forum?id=Sm_4MDxPWXf", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4, + 4 + ], + "rating": "4.80", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "There are two major classes of natural language grammars --- the dependency grammar that models one-to-one correspondences between words and the constituency grammar that models the assembly of one or several corresponded words. While previous unsupervised parsing methods mostly focus on only inducing one class of grammars, we introduce a novel model, StructFormer, that can induce dependency and constituency structure at the same time. To achieve this, we propose a new parsing framework that can jointly generates constituency tree and dependency graph. Then we integrate the induced dependency relations into transformer, in a differentiable manner, through a novel dependency-constrained self-attention mechanism. 
Experimental results show that our model can achieve strong results on unsupervised constituency parsing, unsupervised dependency parsing and masked language modeling at the same time.", + "title": "StructFormer: Joint Unsupervised Induction of Dependency and Constituency Structure from Masked Language Modeling", + "authors": [ + "Yikang Shen", + "Yi Tay", + "Che Zheng", + "Dara Bahri", + "Donald Metzler", + "Aaron Courville" + ], + "emails": [ + "~Dara_Bahri1", + "google.com", + "~Yi_Tay1", + "~Yikang_Shen1", + "~Aaron_Courville3" + ], + "rank": 1986 + }, + { + "url": "https://openreview.net/forum?id=tf8a4jDRFCv", + "decision": "Reject", + "ratings": [ + 6, + 3, + 6, + 5 + ], + "rating": "4.80", + "confidences": [ + 4, + 5, + 3, + 3 + ], + "abstract": " Learning on sets is increasingly gaining attention in the machine learning community, due to its widespread applicability. Typically, representations over sets are computed by using fixed aggregation functions such as sum or maximum. However, recent results showed that universal function representation by sum- (or max-) decomposition requires either highly discontinuous (and thus poorly learnable) mappings, or a latent dimension equal to the maximum number of elements in the set. To mitigate this problem, we introduce LAF (Learning Aggregation Functions), a learnable aggregator for sets of arbitrary cardinality. LAF can approximate several extensively used aggregators (such as average, sum, maximum) as well as more complex functions (e.g. variance and skewness). 
We report experiments on semi-synthetic and real data showing that LAF outperforms state-of-the-art sum- (max-) decomposition architectures such as DeepSets and library-based architectures like Principal Neighborhood Aggregation.", + "title": "Learning Aggregation Functions", + "authors": [ + "Giovanni Pellegrini", + "Alessandro Tibo", + "Paolo Frasconi", + "Andrea Passerini", + "Manfred Jaeger" + ], + "emails": [ + "~Andrea_Passerini2", + "~Paolo_Frasconi1", + "~Giovanni_Pellegrini1", + "cs.aau.dk", + "~Alessandro_Tibo1" + ], + "rank": 1987 + }, + { + "url": "https://openreview.net/forum?id=EArH-0iHhIq", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 3 + ], + "rating": "4.79", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "During the last decade, neural networks have been intensively used to tackle various problems and they have often led to state-of-the-art results. These networks are composed of multiple jointly optimized layers arranged in a hierarchical structure. At each layer, the aim is to learn to extract hidden patterns needed to solve the problem at hand and forward it to the next layers. In the standard form, a neural network is trained with gradient-based optimization, where the errors are back-propagated from the last layer back to the first one. Thus at each optimization step, neurons at a given layer receive feedback from neurons belonging to higher layers of the hierarchy. In this paper, we propose to complement this traditional 'between-layer' feedback with additional 'within-layer' feedback to encourage diversity of the activations within the same layer. To this end, we measure the pairwise similarity between the outputs of the neurons and use it to model the layer's overall diversity. By penalizing similarities and promoting diversity, we encourage each neuron to learn a distinctive representation and, thus, to enrich the data representation learned within the layer and to increase the total capacity of the model. 
We theoretically study how the within-layer activation diversity affects the generalization performance of a neural network in a supervised context and we prove that increasing the diversity of hidden activations reduces the estimation error. In addition to the theoretical guarantees, we present an empirical study confirming that the proposed approach enhances the performance of neural networks.", + "title": "ON NEURAL NETWORK GENERALIZATION VIA PROMOTING WITHIN-LAYER ACTIVATION DIVERSITY", + "authors": [ + "Firas Laakom", + "Jenni Raitoharju", + "Alexandros Iosifidis", + "Moncef Gabbouj" + ], + "emails": [ + "~Moncef_Gabbouj1", + "~Jenni_Raitoharju1", + "~Firas_Laakom1", + "~Alexandros_Iosifidis2" + ], + "rank": 1988 + }, + { + "url": "https://openreview.net/forum?id=vSttC0bV3Ji", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5, + 5 + ], + "rating": "4.79", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "We consider the problem of modeling the dynamics of continuous spatial-temporal processes represented by irregular samples through both space and time. Such processes occur in sensor networks, citizen science, multi-robot systems, and many others. \nWe propose a new deep model that is able to directly learn and predict over this irregularly sampled data, without voxelization, by leveraging a recent convolutional architecture for static point clouds. The model also easily incorporates the notion of multiple entities in the process. In particular, the model can flexibly answer prediction queries about arbitrary space-time points for different entities regardless of the distribution of the training or test-time data. We present experiments on real-world weather station data and battles between large armies in StarCraft II. The results demonstrate the model's flexibility in answering a variety of query types and demonstrate improved performance and efficiency compared to state-of-the-art baselines. 
", + "title": "Deep Convolution for Irregularly Sampled Temporal Point Clouds", + "authors": [ + "Erich Merrill III", + "Stefan Lee", + "Li Fuxin", + "Thomas G Dietterich", + "Alan Fern" + ], + "emails": [ + "~Thomas_G_Dietterich1", + "~Alan_Fern1", + "~Stefan_Lee1", + "~Erich_Merrill_III1", + "~Li_Fuxin1" + ], + "rank": 1989 + }, + { + "url": "https://openreview.net/forum?id=MD3D5UbTcb1", + "decision": "Reject", + "ratings": [ + 6, + 3, + 6, + 3, + 7 + ], + "rating": "4.79", + "confidences": [ + 3, + 5, + 3, + 4, + 4 + ], + "abstract": "Graph Neural Networks (GNNs) have risen to prominence in learning representations for graph structured data. A single GNN layer typically consists of a feature transformation and a feature aggregation operation. The former normally uses feed-forward networks to transform features, while the latter aggregates the transformed features over the graph. Numerous recent works have proposed GNN models with different designs in the aggregation operation. In this work, we establish mathematically that the aggregation processes in a group of representative GNN models including GCN, GAT, PPNP, and APPNP can be regarded as (approximately) solving a graph denoising problem with a smoothness assumption. Such a unified view across GNNs not only provides a new perspective to understand a variety of aggregation operations but also enables us to develop a unified graph neural network framework UGNN. To demonstrate its promising potential, we instantiate a novel GNN model, ADA-UGNN, derived from UGNN, to handle graphs with adaptive smoothness across nodes. Comprehensive experiments show the effectiveness of ADA-UGNN. 
", + "title": "A Unified View on Graph Neural Networks as Graph Signal Denoising", + "authors": [ + "Yao Ma", + "Xiaorui Liu", + "Tong Zhao", + "Yozen Liu", + "Jiliang Tang", + "Neil Shah" + ], + "emails": [ + "snap.com", + "~Xiaorui_Liu1", + "~Yao_Ma3", + "~Tong_Zhao3", + "~Jiliang_Tang1" + ], + "rank": 1990 + }, + { + "url": "https://openreview.net/forum?id=7Yhok3vJpU", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5, + 5 + ], + "rating": "4.79", + "confidences": [ + 3, + 3, + 3, + 5 + ], + "abstract": "Learning from natural datasets poses significant challenges for traditional classification methods based on the cross-entropy objective due to imbalanced class distributions. It is intuitive to assume that the examples from rare classes are harder to learn so that the classifier is uncertain of the prediction, which establishes the low-likelihood area. Based on this, existing approaches drive the classifier actively to correctly predict those incorrect, rare examples. However, this assumption is one-sided and could be misleading. We find in practice that the high-likelihood area contains correct predictions for rare class examples and it plays a vital role in learning imbalanced class distributions. In light of this finding, we propose the Eureka Loss, which rewards the classifier when examples belong to rare classes in the high-likelihood area are correctly predicted. Experiments on the large-scale long-tailed iNaturalist 2018 classification dataset and the ImageNet-LT benchmark both validate the proposed approach. 
We further analyze the influence of the Eureka Loss in detail on diverse data distributions.", + "title": "High-Likelihood Area Matters --- Rewarding Correct,Rare Predictions Under Imbalanced Distributions", + "authors": [ + "guangxiang zhao", + "Lei Li", + "Xuancheng Ren", + "Xu Sun", + "Bin He" + ], + "emails": [ + "stu.pku.edu.cn", + "~Xu_Sun1", + "huawei.com", + "~guangxiang_zhao2", + "~Xuancheng_Ren1" + ], + "rank": 1991 + }, + { + "url": "https://openreview.net/forum?id=jOQbDGngsg8", + "decision": "Reject", + "ratings": [ + 6, + 5, + 3, + 6 + ], + "rating": "4.79", + "confidences": [ + 3, + 2, + 5, + 4 + ], + "abstract": "Many data mining and analytical tasks rely on the abstraction of networks (graphs) to summarize relational structures among individuals (nodes). Since relational data are often sensitive, we aim to seek effective approaches to release utility-preserved yet privacy-protected structured data. In this paper, we leverage the differential privacy (DP) framework, to formulate and enforce rigorous privacy constraints on deep graph generation models, with a focus on edge-DP to guarantee individual link privacy. In particular, we enforce edge-DP by injecting Gaussian noise to the gradients of a link reconstruction based graph generation model, and ensure data utility by improving structure learning with structure-oriented graph comparison. 
Extensive experiments on two real-world network datasets show that our proposed DPGGAN model is able to generate networks with effectively preserved global structure and rigorously protected individual link privacy.", + "title": "Secure Network Release with Link Privacy", + "authors": [ + "Carl Yang", + "Haonan Wang", + "Ke ZHANG", + "Lichao Sun" + ], + "emails": [ + "~Haonan_Wang1", + "~Carl_Yang1", + "~Lichao_Sun1", + "~Ke_ZHANG7" + ], + "rank": 1992 + }, + { + "url": "https://openreview.net/forum?id=R43miizWtUN", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5, + 4 + ], + "rating": "4.79", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "In recent years, Message-Passing Neural Networks (MPNNs), the most prominent Graph Neural Network (GNN) framework, have celebrated much success in the analysis of graph-structured data. In MPNNs the computations are split into three steps, Aggregation, Update and Readout. In this paper a series of models to successively sparsify the linear transform in the Update step is proposed. Specifically, the ExpanderGNN model with a tuneable sparsification rate and the Activation-Only GNN, which has no linear transform in the Update step, are proposed. In agreement with a growing trend in the relevant literature the sparsification paradigm is changed by initialising sparse neural network architectures rather than expensively sparsifying already trained architectures. These novel benchmark models enable a better understanding of the influence of the Update step on model performance and outperform existing simplified benchmark models such as the Simple Graph Convolution (SGC). The ExpanderGNNs, and in some cases the Activation-Only models, achieve performance on par with their vanilla counterparts on several down-stream graph prediction tasks, while containing exponentially fewer trainable parameters. In experiments with matching parameter numbers our benchmark models outperform the state-of-the-art GNNs models. 
These observations enable us to conclude that in practice the Update step often makes no positive contribution to the model performance.", + "title": "Analysing the Update step in Graph Neural Networks via Sparsification", + "authors": [ + "changmin wu", + "Johannes F. Lutzeyer", + "Michalis Vazirgiannis" + ], + "emails": [ + "~changmin_wu1", + "~Michalis_Vazirgiannis2", + "~Johannes_F._Lutzeyer1" + ], + "rank": 1993 + }, + { + "url": "https://openreview.net/forum?id=_b8l7rVPe8z", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5, + 4 + ], + "rating": "4.79", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "This paper focuses on high-transferable adversarial attacks on detectors, which are hard to attack in a black-box manner, because of their multiple-output characteristics and the diversity across architectures. To pursue a high attack transferability, one plausible way is to find a common property across detectors, which facilitates the discovery of common weaknesses. We are the first to suggest that the relevance map for detectors is such a property. Based on it, we design a Relevance Attack on Detectors (RAD), which achieves a state-of-the-art transferability, exceeding existing results by above 20%. On MS COCO, the detection mAPs for all 8 black-box architectures are more than halved and the segmentation mAPs are also significantly influenced. 
Given the great transferability of RAD, we generate the first adversarial dataset for object detection, i.e., Adversarial Objects in COntext (AOCO), which helps to quickly evaluate and improve the robustness of detectors.", + "title": "Relevance Attack on Detectors", + "authors": [ + "Sizhe Chen", + "Fan He", + "Xiaolin Huang", + "Kun Zhang" + ], + "emails": [ + "~Sizhe_Chen1", + "~Kun_Zhang1", + "~Xiaolin_Huang1", + "sjtu.edu.cn" + ], + "rank": 1994 + }, + { + "url": "https://openreview.net/forum?id=Lvb2BKqL49a", + "decision": "Reject", + "ratings": [ + 3, + 6, + 7, + 5 + ], + "rating": "4.79", + "confidences": [ + 5, + 3, + 2, + 4 + ], + "abstract": "With the variational lower bound of mutual information (MI), the estimation of MI can be understood as an optimization task via stochastic gradient descent. In this work, we start by showing how Mutual Information Neural Estimator (MINE) searches for the optimal function T that maximizes the Donsker-Varadhan representation. With our synthetic dataset, we directly observe the neural network outputs during the optimization to investigate why MINE succeeds or fails: We discover the drifting phenomenon, where the constant term of T is shifting through the optimization process, and analyze the instability caused by the interaction between the logsumexp and the insufficient batch size. Next, through theoretical and experimental evidence, we propose a novel lower bound that effectively regularizes the neural network to alleviate the problems of MINE. We also introduce an averaging strategy that produces an unbiased estimate by utilizing multiple batches to mitigate the batch size limitation. 
Finally, we show that L2 regularization achieves significant improvements in both discrete and continuous settings.", + "title": "Regularized Mutual Information Neural Estimation", + "authors": [ + "Kwanghee Choi", + "Siyeong Lee" + ], + "emails": [ + "~Siyeong_Lee1", + "~Kwanghee_Choi1" + ], + "rank": 1995 + }, + { + "url": "https://openreview.net/forum?id=hcCao_UYd6O", + "decision": "Reject", + "ratings": [ + 4, + 5, + 6, + 4 + ], + "rating": "4.79", + "confidences": [ + 5, + 5, + 5, + 4 + ], + "abstract": "Deep neural networks can now perform many tasks that were once thought to be only feasible for humans. While reaching impressive performance under standard settings, such networks are known to be susceptible to adversarial attacks -- slight but carefully constructed perturbations of the inputs which drastically decrease the network performance. Here we propose a new way to improve the network robustness against adversarial attacks by focusing on robust representation learning based on adversarial training procedure, called here Adversarial Feature Desensitization (AFD). AFD desensitizes the representation via an adversarial game between the embedding network and an adversarial discriminator introduced on top of the standard predictive model, which is trained to distinguish between the clean and perturbed inputs from their high-level representations. Our method substantially improves the state-of-the-art in robust classification on MNIST, CIFAR10, and CIFAR100 datasets. More importantly, we demonstrate that AFD has better generalization ability than previous methods, as the learned features maintain their robustness across a wide range of perturbations, including perturbations not seen during training. These results indicate that reducing feature sensitivity is a promising approach for ameliorating the problem of adversarial attacks in deep neural networks. 
", + "title": "Adversarial Feature Desensitization", + "authors": [ + "Pouya Bashivan", + "Mojtaba Faramarzi", + "Touraj Laleh", + "Blake Aaron Richards", + "Irina Rish" + ], + "emails": [ + "~Touraj_Laleh1", + "~Irina_Rish1", + "~Blake_Aaron_Richards1", + "~Mojtaba_Faramarzi1", + "~Pouya_Bashivan1" + ], + "rank": 1996 + }, + { + "url": "https://openreview.net/forum?id=-5VpoDCExrU", + "decision": "Reject", + "ratings": [ + 7, + 4, + 5, + 3 + ], + "rating": "4.79", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Log files are files that record events, messages, or transactions. Logs are rich containers of data because they can store a sequence of structured textual and numerical data. Many sequential forms of data including natural languages and temporal signals can be represented as logs.\n\nWe propose to represent logs at a few levels of abstraction including field level, log level, and log sequence level. The representation for each level can be computed from the previous level. These representations are in vector format and serve as interfaces to downstream applications. We use a version of transformer networks to encode numerical information as well as textual information that is suitable for log embedding. 
We show how a number of log processing applications can be readily solved with our representation.", + "title": "Log representation as an interface for log processing applications", + "authors": [ + "Mohammad Amin Sadeghi", + "Shameem Parambath", + "Ji Lucas", + "Youssef Meguebli", + "Maguette Toure", + "Fawaz Al Qahtani", + "Ting Yu", + "Sanjay Chawla" + ], + "emails": [ + "hbku.edu.qa", + "nokia.com", + "qf.org.qa", + "~Mohammad_Amin_Sadeghi3", + "~Sanjay_Chawla2" + ], + "rank": 1997 + }, + { + "url": "https://openreview.net/forum?id=WEnXA3sAwV7", + "ratings": [ + 5, + 4, + 6, + 4 + ], + "rating": "4.78", + "confidences": [ + 4, + 4, + 5, + 5 + ], + "abstract": "Self-supervised learning (SSL) has shown its powerful ability in discriminative representations for various visual, audio, and video applications. However, most recent works still focus on the different paradigms of spatial-level SSL on video representations. How to self-supervised learn the inherent representation on the temporal dimension is still unrevealed. In this work we propose self-supervised temporal learning (SSTL), aiming at learning spatial-temporal-invariance. Inspired by spatial-based contrastive SSL, we show that significant improvement can be achieved by a proposed temporal-based contrastive learning approach, which includes three novel and efficient modules: temporal augmentations, temporal memory bank and SSTL loss. The temporal augmentations include three operators -- temporal crop, temporal dropout, and temporal jitter. Besides the contrastive paradigm, we observe the temporal contents vary between each layer of the temporal pyramid. The SSTL extends the upper-bound of the current SSL approaches by \u223c6% on the famous video classification tasks and surprisingly improves the current state-of-the-art approaches by \u223c100% on some famous video retrieval tasks. 
The code of SSTL is released with this draft, hoping to nourish the progress of the booming self-supervised learning community.", + "title": "Self-supervised Temporal Learning", + "authors": [ + "Hao Shao", + "Yu Liu", + "Hongsheng Li" + ], + "emails": [ + "~Hongsheng_Li3", + "~Yu_Liu2", + "~Hao_Shao1" + ], + "rank": 1998 + }, + { + "url": "https://openreview.net/forum?id=bsRjn0RH620", + "decision": "Reject", + "ratings": [ + 6, + 5, + 4, + 4 + ], + "rating": "4.78", + "confidences": [ + 5, + 4, + 4, + 5 + ], + "abstract": "Few-Shot Learning (FSL) is a challenging task of recognizing novel classes from scarce labeled samples. Many existing researches focus on learning good representations that generalize well to new categories. However, given low-data regime, the restricting factors of performance on novel classes has not been well studied. In this paper, our objective is to understand the cause of error in few-shot classification, as well as exploring the upper limit of error rate. We first introduce and derive a theoretical upper bound of error rate which is constrained to 1) linear separability in the learned embedding space and 2) discrepancy of task-specific and task-independent classifier. Quantitative experiment is conducted and results show that the error in FSL is dominantly caused by classifier discrepancy. We further propose a simple method to confirm our theoretical analysis and observation. The method adds a constraint to reduce classifier discrepancy so as to lower the upper bound of error rate. Experiments on three benchmarks with different base learners verify the effectiveness of our method. 
It shows that decreasing classifier discrepancy can consistently achieve improvements in most cases.", + "title": " Towards Understanding the Cause of Error in Few-Shot Learning", + "authors": [ + "Liang Song", + "Jinlu Liu", + "Yongqiang Qin" + ], + "emails": [ + "~Yongqiang_Qin1", + "~Jinlu_Liu1", + "~Liang_Song1" + ], + "rank": 1999 + }, + { + "url": "https://openreview.net/forum?id=HowQIZwD_42", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5, + 6 + ], + "rating": "4.78", + "confidences": [ + 5, + 4, + 4, + 5 + ], + "abstract": "Multi-task learning can leverage information learned by one task to benefit the training of other tasks. Despite this capacity, naive formulations often degrade performance and in particular, identifying the tasks that would benefit from co-training remains a challenging design question. In this paper, we analyze the dynamics of information transfer, or transference, across tasks throughout training. Specifically, we develop a similarity measure that can quantify transference among tasks and use this quantity to both better understand the optimization dynamics of multi-task learning as well as improve overall learning performance. In the latter case, we propose two methods to leverage our transference metric. The first operates at a macro-level by selecting which tasks should train together while the second functions at a micro-level by determining how to combine task gradients at each training step. 
We find these methods can lead to significant improvement over prior work on three supervised multi-task learning benchmarks and one multi-task reinforcement learning paradigm.", + "title": "Measuring and Harnessing Transference in Multi-Task Learning", + "authors": [ + "Chris Fifty", + "Ehsan Amid", + "Zhe Zhao", + "Tianhe Yu", + "Rohan Anil", + "Chelsea Finn" + ], + "emails": [ + "~Rohan_Anil1", + "~Zhe_Zhao3", + "~Chelsea_Finn1", + "~Ehsan_Amid1", + "~Tianhe_Yu1", + "~Chris_Fifty1" + ], + "rank": 2000 + }, + { + "url": "https://openreview.net/forum?id=avHr-H-1kEa", + "decision": "Reject", + "ratings": [ + 6, + 5, + 6, + 3 + ], + "rating": "4.78", + "confidences": [ + 1, + 2, + 3, + 3 + ], + "abstract": "The softmax function combined with a cross-entropy loss is a principled approach to modeling probability distributions that has become ubiquitous in deep learning. The softmax function is defined by a lone hyperparameter, the temperature, that is commonly set to one or regarded as a way to tune model confidence after training; however, less is known about how the temperature impacts training dynamics or generalization performance. In this work we develop a theory of early learning for models trained with softmax-cross-entropy loss and show that the learning dynamics depend crucially on the inverse-temperature \u03b2 as well as the magnitude of the logits at initialization, ||\u03b2z||2. We follow up these analytic results with a large-scale empirical study of a variety of model architectures trained on CIFAR10, ImageNet, and IMDB sentiment analysis. We find that generalization performance depends strongly on the temperature, but only weakly on the initial logit magnitude. We provide evidence that the dependence of generalization on \u03b2 is not due to changes in model confidence, but is a dynamical phenomenon. It follows that the addition of \u03b2 as a tunable hyperparameter is key to maximizing model performance. 
Although we find the optimal \u03b2 to be sensitive to the architecture, our results suggest that tuning \u03b2 over the range 10\u22122 to 101 improves performance over all architectures studied. We find that smaller \u03b2 may lead to better peak performance at the cost of learning stability.", + "title": "Temperature check: theory and practice for training models with softmax-cross-entropy losses", + "authors": [ + "Atish Agarwala", + "Samuel Stern Schoenholz", + "Jeffrey Pennington", + "Yann Dauphin" + ], + "emails": [ + "~Atish_Agarwala1", + "~Yann_Dauphin1", + "~Jeffrey_Pennington1", + "~Samuel_Stern_Schoenholz1" + ], + "rank": 2001 + }, + { + "url": "https://openreview.net/forum?id=_zx8Oka09eF", + "decision": "Accept (Poster)", + "ratings": [ + 6, + 5, + 4 + ], + "rating": "4.78", + "confidences": [ + 2, + 3, + 4 + ], + "abstract": "Empirical studies demonstrate that the performance of neural networks improves with increasing number of parameters. In most of these studies, the number of parameters is increased by increasing the network width. This begs the question: Is the observed improvement due to the larger number of parameters, or is it due to the larger width itself? We compare different ways of increasing model width while keeping the number of parameters constant. We show that for models initialized with a random, static sparsity pattern in the weight tensors, network width is the determining factor for good performance, while the number of weights is secondary, as long as the model achieves high training accuarcy. As a step towards understanding this effect, we analyze these models in the framework of Gaussian Process kernels. 
We find that the distance between the sparse finite-width model kernel and the infinite-width kernel at initialization is indicative of model performance.", + "title": "Are wider nets better given the same number of parameters?", + "authors": [ + "Anna Golubeva", + "Guy Gur-Ari", + "Behnam Neyshabur" + ], + "emails": [ + "~Behnam_Neyshabur1", + "~Anna_Golubeva1", + "~Guy_Gur-Ari1" + ], + "rank": 2002 + }, + { + "url": "https://openreview.net/forum?id=21aG-pxQWa", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5 + ], + "rating": "4.78", + "confidences": [ + 2, + 4, + 3 + ], + "abstract": "Machine learning has become more important in real-life decision-making but people are concerned about the ethical problems it may bring when used improperly. Recent work brings the discussion of machine learning fairness into the causal framework and elaborates on the concept of Counterfactual Fairness. In this paper, we develop the Fair Learning through dAta Preprocessing (FLAP) algorithm to learn counterfactually fair decisions from biased training data and formalize the conditions where different data preprocessing procedures should be used to guarantee counterfactual fairness. We also show that Counterfactual Fairness is equivalent to the conditional independence of the decisions and the sensitive attributes given the processed non-sensitive attributes, which enables us to detect discrimination in the original decision using the processed data. 
The performance of our algorithm is illustrated using simulated data and real-world applications.", + "title": "Counterfactual Fairness through Data Preprocessing", + "authors": [ + "Haoyu Chen", + "Wenbin Lu", + "Rui Song", + "Pulak Ghosh" + ], + "emails": [ + "~Rui_Song2", + "~Haoyu_Chen4", + "iimb.ac.in", + "~Wenbin_Lu1" + ], + "rank": 2003 + }, + { + "url": "https://openreview.net/forum?id=6UurSaf08jx", + "ratings": [ + 4, + 4, + 6 + ], + "rating": "4.77", + "confidences": [ + 4, + 4, + 5 + ], + "abstract": "The advent of the Transformer can arguably be described as a driving force behind many of the recent advances in natural language processing. However, despite their sizeable performance improvements, as recently shown, the model is severely over-parameterized, being parameter inefficient and computationally expensive to train. Inspired by the success of parameter-sharing in pre-trained deep contextualized word representation encoders, we explore parameter-sharing methods in Transformers, with a specific focus on encoder-decoder models for sequence-to-sequence tasks such as Machine Translation. We perform an analysis of different parameter sharing/reduction methods and develop the Subformer, a parameter efficient Transformer-based model which combines the newly proposed Sandwich-style parameter sharing technique and self-attentive embedding factorization (SAFE). Experiments on machine translation, abstractive summarization, and language modeling show that the Subformer can outperform the Transformer even when using significantly fewer parameters. On the WMT'14 English-German test set, we show we can perform equally well, and even sometimes outperform (+0.1 BLEU score) the Transformer-base model while using 40% fewer parameters. We also perform equally well as Transformer-big with 40% fewer parameters, achieve performance within 0.1 BLEU with 70% fewer parameters, and outperform the model by 0.7 BLEU with 12M fewer parameters. 
We also outperform the standard Transformer-XL model, achieving a significant 3.6 lower perplexity with 37% fewer parameters.", + "title": "Subformer: A Parameter Reduced Transformer", + "authors": [ + "Machel Reid", + "Edison Marrese-Taylor", + "Yutaka Matsuo" + ], + "emails": [ + "~Yutaka_Matsuo1", + "~Machel_Reid1", + "~Edison_Marrese-Taylor2" + ], + "rank": 2004 + }, + { + "url": "https://openreview.net/forum?id=tcjMxpMJc95", + "ratings": [ + 4, + 6, + 4 + ], + "rating": "4.77", + "confidences": [ + 3, + 5, + 5 + ], + "abstract": "Knowledge distillation (KD), transferring knowledge from a cumbersome teacher model to a lightweight student model, has been investigated to design efficient neural architectures with high accuracy with a few parameters. However, there is a very limited understanding of why and when KD works well. This paper reveals KD's intriguing behaviors, which we believe useful in a better understanding of KD. We first investigate the role of the temperature scaling hyperparameter in KD. It is theoretically shown that the KD loss focuses on the logit vector matching rather than the label matching between the teacher and the student as the temperature grows up. We also find that KD with a sufficiently large temperature outperforms any other recently modified KD methods from extensive experiments. Based on this observation, we conjecture that the logit vector matching is more important than the label matching. To verify this conjecture, we test an extreme logit learning model, where the KD is implemented with Mean Squared Error (MSE) between the student's logit and the teacher's logit. The KD with MSE consistently shows the best accuracy for various environments. We analyze the different learning behavior of KD with respect to the temperature using a new data uncertainty estimator, coined as Top Logit Difference (TLD). We then study the KD performances for various data sizes. 
When there are a few data or a few labels, very interestingly, the incapacious teacher with a shallow depth structure facilitates better generalization than teachers having wider and deeper structures. ", + "title": "Understanding Knowledge Distillation", + "authors": [ + "Taehyeon Kim", + "Jaehoon Oh", + "Nakyil Kim", + "Sangwook Cho", + "Se-Young Yun" + ], + "emails": [ + "~Se-Young_Yun1", + "~Sangwook_Cho1", + "~Nakyil_Kim1", + "~Taehyeon_Kim1", + "~Jaehoon_Oh1" + ], + "rank": 2005 + }, + { + "url": "https://openreview.net/forum?id=gp5Uzbl-9C-", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 6 + ], + "rating": "4.77", + "confidences": [ + 4, + 3, + 3, + 3 + ], + "abstract": "Inducing causal relationships from observations is a classic problem in machine learning. Most work in causality starts from the premise that the causal variables themselves have known semantics or are observed. However, for AI agents such as robots trying to make sense of their environment, the only observables are low-level variables like pixels in images. To generalize well, an agent must induce high-level variables, particularly those which are causal or are affected by causal variables. A central goal for AI and causality is thus the joint discovery of abstract representations and causal structure. In this work, we systematically evaluate the agent's ability to learn underlying causal structure. We note that existing environments for studying causal induction are poorly suited for this objective because they have complicated task-specific causal graphs with many confounding factors. Hence, to facilitate research in learning the representation of high-level variables as well as causal structure among these variables, we present a suite of RL environments created to systematically probe the ability of methods to identify variables as well as causal structure among those variables. 
We evaluate various representation learning algorithms from literature and found that explicitly incorporating structure and modularity in the model can help causal induction in model-based reinforcement learning.", + "title": "Systematic Evaluation of Causal Discovery in Visual Model Based Reinforcement Learning", + "authors": [ + "Nan Rosemary Ke", + "Aniket Rajiv Didolkar", + "Sarthak Mittal", + "Anirudh Goyal", + "Guillaume Lajoie", + "Stefan Bauer", + "Danilo Jimenez Rezende", + "Michael Curtis Mozer", + "Yoshua Bengio", + "Christopher Pal" + ], + "emails": [ + "~Stefan_Bauer1", + "~Danilo_Jimenez_Rezende2", + "~Yoshua_Bengio1", + "~Sarthak_Mittal1", + "~Christopher_Pal1", + "~Guillaume_Lajoie1", + "~Michael_Curtis_Mozer1", + "~Anirudh_Goyal1", + "~Nan_Rosemary_Ke1", + "~Aniket_Rajiv_Didolkar1" + ], + "rank": 2006 + }, + { + "url": "https://openreview.net/forum?id=r1j4zl5HsDj", + "decision": "Reject", + "ratings": [ + 7, + 4, + 3, + 5 + ], + "rating": "4.77", + "confidences": [ + 4, + 3, + 4, + 2 + ], + "abstract": "This work proposes a procedure for designing algorithms for specific adaptive data collection tasks like active learning and pure-exploration multi-armed bandits. Unlike the design of traditional adaptive algorithms that rely on concentration of measure and careful analysis to justify the correctness and sample complexity of the procedure, our adaptive algorithm is learned via adversarial training over equivalence classes of problems derived from information theoretic lower bounds. In particular, a single adaptive learning algorithm is learned that competes with the best adaptive algorithm learned for each equivalence class. Our procedure takes as input just the available queries, set of hypotheses, loss function, and total query budget. 
This is in contrast to existing meta-learning work that learns an adaptive algorithm relative to an explicit, user-defined subset or prior distribution over problems which can be challenging to define and be mismatched to the instance encountered at test time. This work is particularly focused on the regime when the total query budget is very small, such as a few dozen, which is much smaller than those budgets typically considered by theoretically derived algorithms. We perform synthetic experiments to justify the stability and effectiveness of the training procedure, and then evaluate the method on tasks derived from real data including a noisy 20 Questions game and a joke recommendation task.", + "title": "Learning to Actively Learn: A Robust Approach", + "authors": [ + "Jifan Zhang", + "Kevin Jamieson" + ], + "emails": [ + "~Jifan_Zhang1", + "~Kevin_Jamieson1" + ], + "rank": 2007 + }, + { + "url": "https://openreview.net/forum?id=DdhfDplcxs1", + "ratings": [ + 4, + 5, + 5, + 5 + ], + "rating": "4.77", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Learning from label proportions (LLP), where the training data are arranged in form of groups with only label proportions provided instead of the exact labels, is an important weakly supervised learning paradigm in machine learning. Existing deep learning based LLP methods pursue an end-to-end learning fashion and construct the loss using Kullback-Leibler divergence, which measures the difference between the prior and posterior class distributions in each bag. However, unconstrained optimization on this objective can hardly reach a solution in accordance with the given proportions at the bag level. In addition, concerning the probabilistic classifier, it probably results in high-entropy conditional class distributions at the instance level. These issues will further degrade the performance of instance-level classification. 
To address these problems, we propose to impose the exact proportions on the classifier with a constrained optimization, and firstly apply the optimal transport algorithm to solve LLP. With the entropic regularization, our formulation allows to solve a convex programming efficiently and further arrive at an integer solution that meets the proportion constraint strictly. More importantly, our framework is model-agnostic, and demonstrates compelling performance improvement in extensive experiments, when it is incorporated into other deep LLP models as a post-hoc stage.", + "title": "OT-LLP: Optimal Transport for Learning from Label Proportions", + "authors": [ + "Jiabin Liu", + "Hanyuan Hang", + "Bo Wang", + "Xin Shen", + "Zhouchen Lin" + ], + "emails": [ + "~Hanyuan_Hang1", + "~Xin_Shen2", + "~Jiabin_Liu1", + "~Bo_Wang14", + "~Zhouchen_Lin1" + ], + "rank": 2008 + }, + { + "url": "https://openreview.net/forum?id=YhhEarKSli9", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 4 + ], + "rating": "4.77", + "confidences": [ + 3, + 3, + 4, + 3 + ], + "abstract": "Learning data representations that capture task-related features, but are invariant to nuisance variations remains a key challenge in machine learning. We introduce an automated Bayesian inference framework, called AutoBayes, that explores different graphical models linking classifier, encoder, decoder, estimator and adversarial network blocks to optimize nuisance-invariant machine learning pipelines. AutoBayes also enables learning disentangled representations, where the latent variable is split into multiple pieces to impose various relationships with the nuisance variation and task labels. We benchmark the framework on several public datasets, and provide analysis of its capability for subject-transfer learning with/without variational modeling and adversarial training. 
We demonstrate a significant performance improvement with ensemble learning across explored graphical models.", + "title": "AutoBayes: Automated Bayesian Graph Exploration for Nuisance-Robust Inference", + "authors": [ + "Andac Demir", + "Toshiaki Koike-Akino", + "Ye Wang", + "Deniz Erdogmus" + ], + "emails": [ + "~Ye_Wang2", + "~Andac_Demir1", + "~Deniz_Erdogmus1", + "~Toshiaki_Koike-Akino1" + ], + "rank": 2009 + }, + { + "url": "https://openreview.net/forum?id=QKbS9KXkE_y", + "decision": "Reject", + "ratings": [ + 5, + 3, + 6, + 5 + ], + "rating": "4.77", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Hierarchical approaches for reinforcement learning aim to improve data efficiency and accelerate learning by incorporating different abstractions. We introduce Hindsight Off-policy Options (HO2), an efficient off-policy option learning algorithm, and isolate the impact of action and temporal abstraction in the option framework by comparing flat policies, mixture policies without temporal abstraction, and finally option policies; all with comparable policy optimization. When aiming for data efficiency, we demonstrate the importance of off-policy optimization, as even flat policies trained off-policy can outperform on-policy option methods. In addition, off-policy training and backpropagation through a dynamic programming inference procedure -- through time and through the policy components for every time-step -- enable us to train all components' parameters independently of the data-generating behavior policy. We continue to illustrate challenges in off-policy option learning and the related importance of trust-region constraints. Experimentally, we demonstrate that HO2 outperforms existing option learning methods and that both action and temporal abstraction provide strong benefits in particular in more demanding simulated robot manipulation tasks from raw pixel inputs. 
Finally, we develop an intuitive extension to encourage temporal abstraction and investigate differences in its impact between learning from scratch and using pre-trained options. ", + "title": "Data-efficient Hindsight Off-policy Option Learning", + "authors": [ + "Markus Wulfmeier", + "Dushyant Rao", + "Roland Hafner", + "Thomas Lampe", + "Abbas Abdolmaleki", + "Tim Hertweck", + "Michael Neunert", + "Dhruva Tirumala", + "Noah Yamamoto Siegel", + "Nicolas Heess", + "Martin Riedmiller" + ], + "emails": [ + "~Dhruva_Tirumala1", + "~Michael_Neunert1", + "~Thomas_Lampe1", + "~Dushyant_Rao1", + "~Markus_Wulfmeier1", + "~Roland_Hafner1", + "~Noah_Yamamoto_Siegel1", + "~Abbas_Abdolmaleki3", + "google.com", + "~Nicolas_Heess1", + "~Martin_Riedmiller1" + ], + "rank": 2010 + }, + { + "url": "https://openreview.net/forum?id=QTgP9nKmMPM", + "decision": "Reject", + "ratings": [ + 4, + 6, + 4 + ], + "rating": "4.77", + "confidences": [ + 4, + 5, + 4 + ], + "abstract": "Graph Neural Networks (GNNs) become very popular for graph-related applications due to their superior performance. However, they have been shown to be computationally expensive in large scale settings, because their produced node embeddings have to be computed recursively, which scales exponentially with the number of layers. To address this issue, several sampling-based methods have recently been proposed to perform training on a subset of nodes while maintaining the fidelity of the trained model. In this work, we introduce a decoupled greedy learning method for GNNs (DGL-GNN) that, instead of sampling the input graph, decouples the GNN into smaller modules and associates each module with greedy auxiliary objectives. Our approach allows GNN layers to be updated during the training process without waiting for feedback from successor layers, thus making parallel GNN training possible. 
Our method achieves improved efficiency without significantly compromising model performances, which would be important for time or memory limited applications. Further, we propose a lazy-update scheme during training to further improve its efficiency. We empirically analyse our proposed DGL-GNN model, and demonstrate its effectiveness and superior efficiency through a range of experiments. Compared to the sampling-based acceleration, our model is more stable, and we do not have to trade-off between efficiency and accuracy. Finally, we note that while here we focus on comparing the decoupled approach as an alternative to other methods, it can also be regarded as complementary, for example, to sampling and other scalability-enhancing improvements of GNN training.", + "title": "Decoupled Greedy Learning of Graph Neural Networks", + "authors": [ + "YEWEN WANG", + "Jian Tang", + "Yizhou Sun", + "Guy Wolf" + ], + "emails": [ + "~Jian_Tang1", + "~YEWEN_WANG1", + "~Guy_Wolf1", + "~Yizhou_Sun1" + ], + "rank": 2011 + }, + { + "url": "https://openreview.net/forum?id=F8xpAPm_ZKS", + "decision": "Reject", + "ratings": [ + 3, + 6, + 5, + 5 + ], + "rating": "4.77", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Credit assignment in reinforcement learning is the problem of measuring an action\u2019s influence on future rewards. \nIn particular, this requires separating \\emph{skill} from \\emph{luck}, ie.\\ disentangling the effect of an action on rewards from that of external factors and subsequent actions. To achieve this, we adapt the notion of counterfactuals from causality theory to a model-free RL setup. \nThe key idea is to condition value functions on \\emph{future} events, by learning to extract relevant information from a trajectory. 
We then propose to use these as future-conditional baselines and critics in policy gradient algorithms and we develop a valid, practical variant with provably lower variance, while achieving unbiasedness by constraining the hindsight information not to contain information about the agent\u2019s actions. We demonstrate the efficacy and validity of our algorithm on a number of illustrative problems.", + "title": "Model-Free Counterfactual Credit Assignment", + "authors": [ + "Thomas Mesnard", + "Theophane Weber", + "Fabio Viola", + "Shantanu Thakoor", + "Alaa Saade", + "Anna Harutyunyan", + "Will Dabney", + "Tom Stepleton", + "Nicolas Heess", + "Marcus Hutter", + "Lars Holger Buesing", + "Remi Munos" + ], + "emails": [ + "~Lars_Holger_Buesing1", + "~Remi_Munos1", + "~Fabio_Viola2", + "~Anna_Harutyunyan1", + "google.com", + "~Tom_Stepleton1", + "~Nicolas_Heess1", + "~Will_Dabney1", + "~Theophane_Weber1", + "~Marcus_Hutter1" + ], + "rank": 2012 + }, + { + "url": "https://openreview.net/forum?id=xyEx4_lHqvB", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 4 + ], + "rating": "4.77", + "confidences": [ + 4, + 3, + 3, + 3 + ], + "abstract": "We propose an ensemble-based defense against adversarial examples using distance map layers (DMLs). Similar to fully connected layers, DMLs can be used to output logits for a multi-class classification model. We show in this paper how DMLs can be deployed to prevent transferability of attacks across ensemble members by adapting pairwise (almost) orthogonal covariance matrices. We also illustrate how DMLs provide an efficient way to regularize the Lipschitz constant of the ensemble's member models, which further boosts the resulting robustness. 
Through empirical evaluations across multiple datasets and attack models, we demonstrate that the ensembles based on DMLs can achieve high benign accuracy while exhibiting robustness against adversarial attacks using multiple white-box techniques along with AutoAttack.", + "title": "Ensemble-based Adversarial Defense Using Diversified Distance Mapping", + "authors": [ + "Ehsan Kazemi", + "Mohamed E. Hussein", + "Wael AbdAlmgaeed" + ], + "emails": [ + "~Mohamed_E._Hussein1", + "~Wael_AbdAlmgaeed1", + "~Ehsan_Kazemi3" + ], + "rank": 2013 + }, + { + "url": "https://openreview.net/forum?id=HjD70ArLTQt", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5, + 6 + ], + "rating": "4.76", + "confidences": [ + 4, + 5, + 3, + 5 + ], + "abstract": "Although recent complex scene conditional generation models generate increasingly appealing scenes, it is very hard to assess which models perform better and why. This is often due to models being trained to fit different data splits, and defining their own experimental setups. In this paper, we propose a methodology to compare complex scene conditional generation models, and provide an in-depth analysis that assesses the ability of each model to (1) fit the training distribution and hence perform well on seen conditionings, (2) to generalize to unseen conditionings composed of seen object combinations, and (3) generalize to unseen conditionings composed of unseen object combinations. As a result, we observe that recent methods are able to generate recognizable scenes given seen conditionings, and exploit compositionality to generalize to unseen conditionings with seen object combinations. However, all methods suffer from noticeable image quality degradation when asked to generate images from conditionings composed of unseen object combinations. 
Moreover, through our analysis, we identify the advantages of different pipeline components, and find that (1) encouraging compositionality through instance-wise spatial conditioning normalizations increases robustness to both types of unseen conditionings, (2) using semantically aware losses such as the scene-graph perceptual similarity helps improve some dimensions of the generation process, and (3) enhancing the quality of generated masks and the quality of the individual objects are crucial steps to improve robustness to both types of unseen conditionings.", + "title": "Generating unseen complex scenes: are we there yet?", + "authors": [ + "Arantxa Casanova", + "Michal Drozdzal", + "Adriana Romero" + ], + "emails": [ + "~Michal_Drozdzal1", + "~Adriana_Romero1", + "~Arantxa_Casanova1" + ], + "rank": 2014 + }, + { + "url": "https://openreview.net/forum?id=sm2PcsKjm8", + "ratings": [ + 5, + 4, + 5, + 5 + ], + "rating": "4.76", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Class-Incremental Learning (CIL) aims to learn a classification model with the number of classes increasing phase-by-phase. The inherent problem in CIL is the stability-plasticity dilemma between the learning of old and new classes, i.e., high-plasticity models easily forget old classes but high-stability models are weak to learn new classes. We alleviate this issue by proposing a novel network architecture called Meta-Aggregating Networks (MANets) in which we explicitly build two residual blocks at each residual level (taking ResNet as the baseline architecture): a stable block and a plastic block. We aggregate the output feature maps from these two blocks and then feed the results to the next-level blocks. We meta-learn the aggregating weights in order to dynamically optimize and balance between two types of blocks, i.e., between stability and plasticity. 
We conduct extensive experiments on three CIL benchmarks: CIFAR-100, ImageNet-Subset, and ImageNet, and show that many existing CIL methods can be straightforwardly incorporated on the architecture of MANets to boost their performance. ", + "title": "Meta-Aggregating Networks for Class-Incremental Learning", + "authors": [ + "Yaoyao Liu", + "Bernt Schiele", + "Qianru Sun" + ], + "emails": [ + "~Bernt_Schiele1", + "~Qianru_Sun2", + "~Yaoyao_Liu1" + ], + "rank": 2015 + }, + { + "url": "https://openreview.net/forum?id=f3wagFp88BE", + "ratings": [ + 6, + 5, + 4, + 4 + ], + "rating": "4.76", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "Batch normalization (BN) has been widely used in modern deep neural networks (DNNs) due to fast convergence. BN is observed to increase the model accuracy while at the cost of adversarial robustness. We conjecture that the increased adversarial vulnerability is caused by BN shifting the model to rely more on non-robust features (NRFs). Our exploration finds that other normalization techniques also increase adversarial vulnerability and our conjecture is also supported by analyzing the model corruption robustness and feature transferability. With a classifier DNN defined as a feature set F we propose a framework for disentangling F robust usefulness into F usefulness and F robustness. We adopt a local linearity based metric, termed LIGS, to define and quantify F robustness. Measuring the F robustness with the LIGS provides direct insight on the feature robustness shift independent of usefulness. Moreover, the LIGS trend during the whole training stage sheds light on the order of learned features, \\ie from RFs (robust features) to NRFs, or vice versa. Our work analyzes how BN and other factors influence the DNN from the feature perspective. Prior works mainly adopt accuracy to evaluate their influence regarding F usefulness, while we believe evaluating F robustness is equally important, for which our work fills the gap. 
", + "title": "Batch Normalization Increases Adversarial Vulnerability: Disentangling Usefulness and Robustness of Model Features", + "authors": [ + "Philipp Benz", + "Chaoning Zhang", + "In So Kweon" + ], + "emails": [ + "~Philipp_Benz1", + "~In_So_Kweon2", + "~Chaoning_Zhang1" + ], + "rank": 2016 + }, + { + "url": "https://openreview.net/forum?id=ascdLuNQY4J", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 4 + ], + "rating": "4.76", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "An important goal of neural architecture search (NAS) is to automate-away the design of neural networks on new tasks in under-explored domains, thus helping to democratize machine learning. However, current NAS research largely focuses on search spaces consisting of existing operations---such as different types of convolution---that are already known to work well on well-studied problems---often in computer vision. Our work is motivated by the following question: can we enable users to build their own search spaces and discover the right neural operations given data from their specific domain? We make progress towards this broader vision for NAS by introducing a space of operations generalizing the convolution that enables search over a large family of parameterizable linear-time matrix-vector functions. Our flexible construction allows users to design their own search spaces adapted to the nature and shape of their data, to warm-start search methods using convolutions when they are known to perform well, or to discover new operations from scratch when they do not. 
We evaluate our approach on several novel search spaces over vision and text data, on all of which simple NAS search algorithms can find operations that perform better than baseline layers.", + "title": "Searching for Convolutions and a More Ambitious NAS", + "authors": [ + "Nicholas Carl Roberts", + "Mikhail Khodak", + "Tri Dao", + "Liam Li", + "Nina Balcan", + "Christopher Re", + "Ameet Talwalkar" + ], + "emails": [ + "~Nicholas_Carl_Roberts1", + "~Christopher_Re1", + "~Tri_Dao1", + "~Mikhail_Khodak1", + "~Nina_Balcan1", + "~Liam_Li1", + "~Ameet_Talwalkar1" + ], + "rank": 2017 + }, + { + "url": "https://openreview.net/forum?id=5vShUEyjmm", + "ratings": [ + 5, + 6, + 5, + 3 + ], + "rating": "4.76", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "We present a new generative autoencoder model with dual contradistinctive losses to improve generative autoencoder that performs simultaneous inference (reconstruction) and synthesis (generation). We name our model dual contradistinctive generative autoencoder (DC-VAE) that integrates an instance-level discriminative loss (maintaining the instance-level fidelity for the reconstruction/synthesis) with a set-level adversarial loss (encouraging the set-level fidelity for the reconstruction/synthesis), both being contradistinctive. There also exists a mathematical connection between the instance-based classification and instance-level conditional distribution. DC-VAE achieves competitive results in three tasks, including image synthesis, image reconstruction, and representation learning. 
DC-VAE is applicable to various tasks in computer vision and machine learning.", + "title": "Dual Contradistinctive Generative Autoencoder", + "authors": [ + "Gaurav Parmar", + "Dacheng Li", + "Kwonjoon Lee", + "Zhuowen Tu" + ], + "emails": [ + "~Dacheng_Li1", + "~Zhuowen_Tu1", + "~Gaurav_Parmar1", + "~Kwonjoon_Lee1" + ], + "rank": 2018 + }, + { + "url": "https://openreview.net/forum?id=uqD-un_Mzd-", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5, + 5 + ], + "rating": "4.76", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "Graph Convolutional Neural Networks (GCNs) exploit convolution operators, based on some neighborhood aggregating scheme, to compute representations of graphs. The most common convolution operators only exploit local topological information. To consider wider topological receptive fields, the mainstream approach is to non-linearly stack multiple Graph Convolutional (GC) layers. In this way, however, interactions among GC parameters at different levels pose a bias on the flow of topological information. In this paper, we propose a different strategy, considering a single graph convolution layer that independently exploits neighbouring nodes at different topological distances, generating decoupled representations for each of them. These representations are then processed by subsequent readout layers. We implement this strategy introducing the Polynomial Graph Convolution (PGC) layer, that we prove being more expressive than the most common convolution operators and their linear stacking. Our contribution is not limited to the definition of a convolution operator with a larger receptive field, but we prove both theoretically and experimentally that the common way multiple non-linear graph convolutions are stacked limits the neural network expressiveness. 
Specifically, we show that a Graph Neural Network architecture with a single PGC layer achieves state of the art performance on many commonly adopted graph classification benchmarks.", + "title": "Polynomial Graph Convolutional Networks", + "authors": [ + "Luca Pasa", + "Nicol\u00f2 Navarin", + "Alessandro Sperduti" + ], + "emails": [ + "~Alessandro_Sperduti1", + "~Luca_Pasa1", + "~Nicol\u00f2_Navarin1" + ], + "rank": 2019 + }, + { + "url": "https://openreview.net/forum?id=qfkz7wHXKC", + "ratings": [ + 6, + 5, + 4, + 4 + ], + "rating": "4.76", + "confidences": [ + 5, + 3, + 4, + 5 + ], + "abstract": "Transformer has been widely adopted in Neural Machine Translation (NMT) because of its large capacity and parallel training of sequence generation. However, the deployment of Transformer is challenging because different use scenarios require models of different complexities and scales. Naively training multiple Transformers of different scales is redundant in terms of both computation and memory. In this paper, we propose a novel scalable Transformers, which naturally contains sub-Transformers of different scales and have shared parameters. Each sub-Transformer can be easily obtained by cropping the parameters of the largest Transformer. A three-stage training scheme is proposed to tackle the difficulty of training the scalable Transformers, which introduces additional supervisions from word-level and sequence-level self-distillation. 
Extensive experiments were conducted on WMT EN-De and En-Fr to validate the effectiveness of the proposed scalable Transformers.", + "title": "Scalable Transformers for Neural Machine Translation", + "authors": [ + "Gao Peng", + "Shijie Geng", + "Xiaogang Wang", + "Jifeng Dai", + "Hongsheng Li" + ], + "emails": [ + "~Jifeng_Dai1", + "~Gao_Peng1", + "~Xiaogang_Wang2", + "~Hongsheng_Li3", + "~Shijie_Geng1" + ], + "rank": 2020 + }, + { + "url": "https://openreview.net/forum?id=6FsCHsZ66Fp", + "decision": "Reject", + "ratings": [ + 5, + 4, + 6, + 4 + ], + "rating": "4.76", + "confidences": [ + 5, + 5, + 4, + 3 + ], + "abstract": "It is well-known that standard neural networks, even with a high classification accuracy, are vulnerable to small \u2113\u221e perturbations. Many attempts have been tried to learn a network that can resist such adversarial attacks. However, most previous works either can only provide empirical verification of the defense to a particular attack method or can only develop a theoretical guarantee of the model robustness in limited scenarios. In this paper, we develop a theoretically principled neural network that inherently resists \u2113\u221e perturbations. In particular, we design a novel neuron that uses \u2113\u221e distance as its basic operation, which we call \u2113\u221e-dist neuron. We show that the \u2113\u221e-dist neuron is naturally a 1-Lipschitz function with respect to the \u2113\u221e norm, and the neural networks constructed with \u2113\u221e-dist neuron (\u2113\u221e-dist Nets) enjoy the same property. This directly provides a theoretical guarantee of the certified robustness based on the margin of the prediction outputs. We further prove that the \u2113\u221e-dist Nets have enough expressiveness power to approximate any 1-Lipschitz function, and can generalize well as the robust test error can be upper-bounded by the performance of a large margin classifier on the training data. 
Preliminary experiments show that even without the help of adversarial training, the learned networks with high classification accuracy are already provably robust.", + "title": "Towards certifying \u2113\u221e robustness using Neural networks with \u2113\u221e-dist Neurons", + "authors": [ + "Bohang Zhang", + "Zhou Lu", + "Tianle Cai", + "Di He", + "Liwei Wang" + ], + "emails": [ + "~Di_He1", + "pku.edu.cn", + "~Zhou_Lu1", + "~Liwei_Wang1", + "~Tianle_Cai1" + ], + "rank": 2021 + }, + { + "url": "https://openreview.net/forum?id=RDpTZpubOh7", + "decision": "Reject", + "ratings": [ + 3, + 6, + 6, + 4 + ], + "rating": "4.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "As reinforcement learning agents become increasingly integrated into complex, real-world environments, designing for safety becomes a critical consideration. We specifically focus on researching scenarios where agents can cause undesired side effects while executing a policy on a primary task. Since one can define multiple tasks for a given environment dynamics, there are two important challenges. First, we need to abstract the concept of safety that applies broadly to that environment independent of the specific task being executed. Second, we need a mechanism for the abstracted notion of safety to modulate the actions of agents executing different policies to minimize their side-effects. In this work, we propose Safety Aware Reinforcement Learning (SARL) - a framework where a virtual safe agent modulates the actions of a main reward-based agent to minimize side effects. The safe agent learns a task-independent notion of safety for a given environment. The main agent is then trained with a regularization loss given by the distance between the native action probabilities of the two agents. 
Since the safe agent effectively abstracts a task-independent notion of safety via its action probabilities, it can be ported to modulate multiple policies solving different tasks within the given environment without further training. We contrast this with solutions that rely on task-specific regularization metrics and test our framework on the SafeLife Suite, based on Conway's Game of Life, comprising a number of complex tasks in dynamic environments. We show that our solution is able to match the performance of solutions that rely on task-specific side-effect penalties on both the primary and safety objectives while additionally providing the benefit of generalizability and portability. ", + "title": "Safety Aware Reinforcement Learning (SARL)", + "authors": [ + "Santiago Miret", + "Somdeb Majumdar", + "Carroll Wainwright" + ], + "emails": [ + "partnershiponai.org", + "~Somdeb_Majumdar1", + "~Santiago_Miret1" + ], + "rank": 2022 + }, + { + "url": "https://openreview.net/forum?id=UoAFJMzCNM", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5 + ], + "rating": "4.75", + "confidences": [ + 5, + 3, + 4 + ], + "abstract": " In this paper we present a deep learning framework for solving large-scale multi-agent non-cooperative stochastic games using fictitious play. The Hamilton-Jacobi-Bellman (HJB) PDE associated with each agent is reformulated into a set of Forward-Backward Stochastic Differential Equations (FBSDEs) and solved via forward sampling on a suitably defined neural network architecture. Decision-making in multi-agent systems suffers from the curse of dimensionality and strategy degeneration as the number of agents and time horizon increase. We propose a novel Deep FBSDE controller framework which is shown to outperform the current state-of-the-art deep fictitious play algorithm on a high dimensional inter-bank lending/borrowing problem. 
More importantly, our approach mitigates the curse of many agents and reduces computational and memory complexity, allowing us to scale up to 1,000 agents in simulation, a scale which, to the best of our knowledge, represents a new state of the art. Finally, we showcase the framework's applicability in robotics on a belief-space autonomous racing problem.", + "title": "Multi-agent Deep FBSDE Representation For Large Scale Stochastic Differential Games", + "authors": [ + "Tianrong Chen", + "Ziyi Wang", + "Ioannis Exarchos", + "Evangelos Theodorou" + ], + "emails": [ + "~Tianrong_Chen1", + "~Evangelos_Theodorou1", + "~Ioannis_Exarchos1", + "~Ziyi_Wang1" + ], + "rank": 2023 + }, + { + "url": "https://openreview.net/forum?id=3-a23gHXQmr", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 5 + ], + "rating": "4.75", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "In parametric density estimation, the parameters of a known probability density are typically recovered from measurements by maximizing the log-likelihood. Prior knowledge of measurement uncertainties is not included in this method -- potentially producing degraded or even biased parameter estimates.\nWe propose an efficient two-step, general-purpose approach for parametric density estimation using deep ensembles. \nFeature predictions and their uncertainties are returned by a deep ensemble and then combined in an importance weighted maximum likelihood estimation to recover parameters representing a known density along with their respective errors. 
To compare the bias-variance tradeoff of different approaches, we define an appropriate figure of merit.\nWe illustrate a number of use cases for our method in the physical sciences and demonstrate state-of-the-art results for X-ray polarimetry that outperform current classical and deep learning methods.", + "title": "Parametric Density Estimation with Uncertainty using Deep Ensembles", + "authors": [ + "Abel Peirson", + "Taylor Howell", + "Marius Aurel Tirlea" + ], + "emails": [ + "stanford.edu", + "~Abel_Peirson1" + ], + "rank": 2024 + }, + { + "url": "https://openreview.net/forum?id=YjXnezbeCwG", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5, + 5 + ], + "rating": "4.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Simultaneous neural machine translation (briefly, NMT) has attracted much attention recently. In contrast to standard NMT, where the NMT system can access the full input sentence, simultaneous NMT is a prefix-to-prefix problem, where the system can only utilize the prefix of the input sentence and thus more uncertainty and difficulty are introduced to decoding. Wait-k inference is a simple yet effective strategy for simultaneous NMT, where the decoder generates the output sequence k words behind the input words. For wait-k inference, we observe that wait-m training with m>k in simultaneous NMT (i.e., using more future information for training than inference) generally outperforms wait-k training. Based on this observation, we propose a method that automatically learns how much future information to use in training for simultaneous NMT. Specifically, we introduce a controller to adaptively select wait-m training strategies according to the network status of the translation model and current training sentence pairs, and the controller is jointly trained with the translation model through bi-level optimization. Experiments on four datasets show that our method brings 1 to 3 BLEU point improvement over baselines under the same latency. 
Our code is available at https://github.com/P2F-research/simulNMT .", + "title": "Learning to Use Future Information in Simultaneous Translation", + "authors": [ + "Xueqing Wu", + "Yingce Xia", + "Lijun Wu", + "Shufang Xie", + "Weiqing Liu", + "Tao Qin", + "Tie-Yan Liu" + ], + "emails": [ + "~Tao_Qin1", + "~Shufang_Xie1", + "microsoft.com", + "~Tie-Yan_Liu1", + "~Lijun_Wu1", + "~Xueqing_Wu1", + "~Yingce_Xia1" + ], + "rank": 2025 + }, + { + "url": "https://openreview.net/forum?id=9sF3n8eAco", + "decision": "Reject", + "ratings": [ + 6, + 7, + 3, + 4 + ], + "rating": "4.75", + "confidences": [ + 4, + 3, + 5, + 4 + ], + "abstract": "Modern deep neural network (DNN) models generally require a huge amount of weight and activation values to achieve good inference outcomes. Those data inevitably demand a massive off-chip memory capacity/bandwidth, and the situation gets even worse if they are represented in high-precision floating-point formats. Effort has been made for representing those data in different 8-bit floating-point formats, nevertheless, a notable accuracy loss is still unavoidable. In this paper we introduce an extremely flexible 8-bit floating-point (FFP8) format whose defining factors \u2013 the bit width of exponent/fraction field, the exponent bias, and even the presence of the sign bit \u2013 are all configurable. We also present a methodology to properly determine those factors so that the accuracy of model inference can be maximized. The foundation of this methodology is based on a key observation \u2013 both the maximum magnitude and the value distribution are quite dissimilar between weights and activations in most DNN models. Experimental results demonstrate that the proposed FFP8 format achieves an extremely low accuracy loss of 0.1%\u223c0.3% for several representative image classification models even without the need of model retraining. 
Besides, it is easy to turn a classical floating-point processing unit into an FFP8-compliant one, and the extra hardware cost is minor.", + "title": "All-You-Can-Fit 8-Bit Flexible Floating-Point Format for Accurate and Memory-Efficient Inference of Deep Neural Networks", + "authors": [ + "Juinn-Dar Huang", + "Cheng-Wei Huang", + "Tim-Wei Chen" + ], + "emails": [ + "~Cheng-Wei_Huang1", + "~Juinn-Dar_Huang1", + "~Tim-Wei_Chen1" + ], + "rank": 2026 + }, + { + "url": "https://openreview.net/forum?id=Im43P9kuaeP", + "decision": "Reject", + "ratings": [ + 6, + 4, + 4, + 5 + ], + "rating": "4.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Watermarking is a commonly used strategy to protect creators' rights to digital images, videos and audio. Recently, watermarking methods have been extended to deep learning models -- in principle, the watermark should be preserved when an adversary tries to copy the model. However, in practice, watermarks can often be removed by an intelligent adversary. Several papers have proposed watermarking methods that claim to be empirically resistant to different types of removal attacks, but these new techniques often fail in the face of new or better-tuned adversaries. In this paper, we propose the first certifiable watermarking method. Using the randomized smoothing technique proposed in Chiang et al., we show that our watermark is guaranteed to be unremovable unless the model parameters are changed by more than a certain \u21132 threshold. 
In addition to being certifiable, our watermark is also empirically more robust compared to previous watermarking methods.", + "title": "Certified Watermarks for Neural Networks", + "authors": [ + "Arpit Amit Bansal", + "Ping-yeh Chiang", + "Michael Curry", + "Hossein Souri", + "Rama Chellappa", + "John P Dickerson", + "Rajiv Jain", + "Tom Goldstein" + ], + "emails": [ + "~Rajiv_Jain1", + "~John_P_Dickerson1", + "~Ping-yeh_Chiang1", + "~Rama_Chellappa1", + "~Michael_Curry2", + "~Tom_Goldstein1", + "~Arpit_Amit_Bansal1", + "~Hossein_Souri1" + ], + "rank": 2027 + }, + { + "url": "https://openreview.net/forum?id=nuwy7R_kemM", + "decision": "Reject", + "ratings": [ + 6, + 5, + 5, + 3 + ], + "rating": "4.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "We detect out-of-training-distribution sentences in Neural Machine Translation using the Bayesian Deep Learning equivalent of Transformer models. For this we develop a new measure of uncertainty designed specifically for long sequences of discrete random variables\u2014i.e. words in the output sentence. Our new measure of uncertainty solves a major intractability in the naive application of existing approaches on long sentences. We use our new measure on a Transformer model trained with dropout approximate inference. On the task of German-English translation using WMT13 and Europarl, we show that with dropout uncertainty our measure is able to identify when Dutch source sentences, sentences which use the same word types as German, are given to the model instead of German.", + "title": "Wat zei je? Detecting Out-of-Distribution Translations with Variational Transformers", + "authors": [ + "Tim Z. 
Xiao", + "Aidan Gomez", + "Yarin Gal" + ], + "emails": [ + "~Aidan_Gomez1", + "~Yarin_Gal1", + "~Tim_Z._Xiao1" + ], + "rank": 2028 + }, + { + "url": "https://openreview.net/forum?id=QtLSvKvm5Po", + "ratings": [ + 4, + 5, + 5, + 5 + ], + "rating": "4.75", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "In this work, we propose the first backdoor attack to graph neural networks (GNN). Specifically, we propose a \\emph{subgraph based backdoor attack} to GNN for graph classification. In our backdoor attack, a GNN classifier predicts an attacker-chosen target label for a testing graph once a predefined subgraph is injected to the testing graph. Our empirical results on three real-world graph datasets show that our backdoor attacks are effective with a small impact on a GNN's prediction accuracy for clean testing graphs.", + "title": "Backdoor Attacks to Graph Neural Networks", + "authors": [ + "Zaixi Zhang", + "Jinyuan Jia", + "Binghui Wang", + "Neil Zhenqiang Gong" + ], + "emails": [ + "~Neil_Zhenqiang_Gong1", + "duke.edu", + "~Binghui_Wang2", + "~Jinyuan_Jia2" + ], + "rank": 2029 + }, + { + "url": "https://openreview.net/forum?id=OcTUl1kc_00", + "decision": "Reject", + "ratings": [ + 4, + 5, + 6, + 4 + ], + "rating": "4.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Graph Convolutional Networks (GCNs) represent the state-of-the-art for many graph related tasks. At every layer, GCNs rely on the graph structure to define an aggregation strategy where each node updates its representation by combining information from its neighbours. A known limitation of GCNs is their inability to infer long-range dependencies. In fact, as the number of layers increases, information gets smoothed and node embeddings become indistinguishable, negatively affecting performance. In this paper we formalize four levels of injection of graph structural information, and use them to analyze the importance of long-range dependencies. 
We then propose a novel regularization technique based on random walks with restart, called RWRReg, which encourages the network to encode long-range information into node embeddings. RWRReg does not require additional operations at inference time, is model-agnostic, and is further supported by our theoretical analysis connecting it to the Weisfeiler-Leman algorithm. Our experimental analysis, on both transductive and inductive tasks, shows that the lack of long-range structural information greatly affects the performance of state-of-the-art models, and that the long-range information exploited by RWRReg leads to an average accuracy improvement of more than 5% on all considered tasks.", + "title": "Are Graph Convolutional Networks Fully Exploiting the Graph Structure?", + "authors": [ + "Davide Buffelli", + "Fabio Vandin" + ], + "emails": [ + "~Fabio_Vandin2", + "~Davide_Buffelli1" + ], + "rank": 2030 + }, + { + "url": "https://openreview.net/forum?id=7K0UUL9y9lE", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 2 + ], + "rating": "4.75", + "confidences": [ + 4, + 3, + 5, + 4 + ], + "abstract": "Transformer-based models have come to dominate the landscape in a wide range of natural language processing (NLP) applications. The heart of the transformer model is the self-attention mechanism, which captures the interactions of token pairs in the input sequences and consequently, depends quadratically on the input sequence length. It is known that training such models on longer sequences is quite expensive, and often, prohibitively so. We show that a Bernoulli sampling attention mechanism based on Locality Sensitive Hashing (LSH), decreases the quadratic complexity to linear. We bypass the quadratic cost by considering self-attention as a sum of individual tokens associated with Bernoulli random variables that can, in principle, be sampled at once by a single hash (although in practice, this number may be a small constant). 
This leads to an efficient sampling scheme to estimate self-attention which relies on specific modifications of LSH (based on feasibility of deployment on GPU architectures). We evaluate our proposed algorithm on the GLUE benchmark with standard 512 sequence length and our method achieves comparable or even slightly better performance than a standard pretrained Transformer. To evaluate whether our method can indeed handle longer sequences, we conduct experiments on long sequence (4096) language model pretraining and achieve consistent results as standard self-attention, while observing sizable inference speed-ups and memory savings.", + "title": "You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling", + "authors": [ + "Zhanpeng Zeng", + "Yunyang Xiong", + "Sathya N. Ravi", + "Shailesh Acharya", + "Glenn Fung", + "Vikas Singh" + ], + "emails": [ + "~Yunyang_Xiong2", + "amfam.com", + "~Zhanpeng_Zeng1", + "~Vikas_Singh1", + "~Sathya_N._Ravi1", + "~Glenn_Fung2" + ], + "rank": 2031 + }, + { + "url": "https://openreview.net/forum?id=vttv9ADGuWF", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 5 + ], + "rating": "4.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "This paper studies a certifiable defense against adversarial patch attacks on image classification. Our approach classifies random crops from the original image independently and the original image is classified as the vote over these crops. This process minimizes changes to the training process, as only the crop classification model needs to be trained, and can be trained in a standard manner without explicit adversarial training. Leveraging the fact that a patch attack can only influence some pixels of the image, we derive certified robustness bounds on the resulting classification. Our method is particularly effective when realistic physical transformations are applied to the adversarial patch, such as affine transformations. 
Such transformations occur naturally when an adversarial patch is physically introduced to a scene. Our method improves upon the current state of the art in defending against patch attacks on CIFAR10 and ImageNet, both in terms of certified accuracy and inference time.", + "title": "Certified robustness against physically-realizable patch attack via randomized cropping", + "authors": [ + "Wan-Yi Lin", + "Fatemeh Sheikholeslami", + "jinghao shi", + "Leslie Rice", + "J Zico Kolter" + ], + "emails": [ + "~Fatemeh_Sheikholeslami1", + "~J_Zico_Kolter1", + "~Wan-Yi_Lin1", + "~jinghao_shi1", + "~Leslie_Rice1" + ], + "rank": 2032 + }, + { + "url": "https://openreview.net/forum?id=GRbZ91LKIya", + "ratings": [ + 5, + 6, + 4, + 4 + ], + "rating": "4.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Building video and language understanding models requires grounding linguistic concepts and video contents into a shared space. Most of previous works learn a holistic alignment between them while neglecting the token-level grounding. Masked token prediction can be used to learn token-level multi-modal representation, but it does not necessarily force lexical grounding on perception and also introduce a domain-shift between pretraining and fine-tuning. This paper introduces a simple token-level contrastive loss (ToCo) informed by syntactic classes (e.g., nouns and verbs) to force the model to prioritize grounding concrete semantic bearing words. ToCo does not mask inputs but poses both local (contextual token) and global (lexical type) pressures for multi-modal alignment in a contrastive manner. Our approach enables a simple vanilla BERT-based multimodal transformer to compete with or outperform existing heavily engineered multi-loss or large models on three benchmarks (YouCook2, MSR-VTT and CrossTask). 
Further, it is plug-n-play such that gains are made in both pretraining and downstream tasks solely, regardless of the underlying visual or textual feature representations. ", + "title": "Token-Level Contrast for Video and Language Alignment", + "authors": [ + "Jianwei Yang", + "Yonatan Bisk", + "Jianfeng Gao" + ], + "emails": [ + "~Yonatan_Bisk1", + "~Jianwei_Yang1", + "~Jianfeng_Gao1" + ], + "rank": 2033 + }, + { + "url": "https://openreview.net/forum?id=Twf5rUVeU-I", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 5 + ], + "rating": "4.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "First-order stochastic methods for solving large-scale non-convex optimization problems are widely used in many big-data applications, e.g. training deep neural networks as well as other complex and potentially non-convex machine learning\nmodels. Their inexpensive iterations generally come together with slow global convergence rate (mostly sublinear), leading to the necessity of carrying out a very high number of iterations before the iterates reach a neighborhood of a minimizer. In this work, we present a first-order stochastic algorithm based on a combination of homotopy methods and SGD, called Homotopy-Stochastic Gradient Descent (H-SGD), which finds interesting connections with some proposed heuristics in the literature, e.g. optimization by Gaussian continuation, training by diffusion, mollifying networks. Under some mild and realistic assumptions on the problem structure, we conduct a theoretical analysis of the proposed algorithm. Our analysis shows that, with a specifically designed scheme for the homotopy parameter, H-SGD enjoys a global linear rate of convergence to a neighborhood of a minimizer while maintaining fast and inexpensive iterations. 
Experimental evaluations confirm the theoretical results and show that H-SGD can outperform standard SGD.", + "title": "Convergence Analysis of Homotopy-SGD for Non-Convex Optimization", + "authors": [ + "Matilde Gargiani", + "Andrea Zanelli", + "Moritz Diehl", + "Quoc Tran-Dinh", + "Frank Hutter" + ], + "emails": [ + "~Andrea_Zanelli1", + "~Quoc_Tran-Dinh2", + "~Frank_Hutter1", + "~Moritz_Diehl1", + "~Matilde_Gargiani1" + ], + "rank": 2034 + }, + { + "url": "https://openreview.net/forum?id=V4AVDoFtVM", + "decision": "Reject", + "ratings": [ + 3, + 5, + 5, + 7 + ], + "rating": "4.75", + "confidences": [ + 5, + 4, + 4, + 3 + ], + "abstract": "The value function lies in the heart of Reinforcement Learning (RL), which defines the long-term evaluation of a policy in a given state. In this paper, we propose Policy-extended Value Function Approximator (PeVFA) which extends the conventional value to be not only a function of state but also an explicit policy representation. Such an extension enables PeVFA to preserve values of multiple policies in contrast to a conventional one with limited capacity for only one policy, inducing the new characteristic of \\emph{value generalization among policies}. From both the theoretical and empirical lens, we study value generalization along the policy improvement path (called local generalization), from which we derive a new form of Generalized Policy Iteration with PeVFA to improve the conventional learning process. Besides, we propose a framework to learn the representation of an RL policy, studying several different approaches to learn an effective policy representation from policy network parameters and state-action pairs through contrastive learning and action prediction. In our experiments, Proximal Policy Optimization (PPO) with PeVFA significantly outperforms its vanilla counterpart in MuJoCo continuous control tasks, demonstrating the effectiveness of value generalization offered by PeVFA and policy representation learning. 
", + "title": "What About Taking Policy as Input of Value Function: Policy-extended Value Function Approximator", + "authors": [ + "Hongyao Tang", + "Zhaopeng Meng", + "Jianye HAO", + "Chen Chen", + "Daniel Graves", + "Dong Li", + "Wulong Liu", + "Yaodong Yang" + ], + "emails": [ + "~Hongyao_Tang1", + "~Wulong_Liu1", + "~Chen_Chen3", + "~Jianye_HAO1", + "~Zhaopeng_Meng1", + "huawei.com", + "~Daniel_Graves1" + ], + "rank": 2035 + }, + { + "url": "https://openreview.net/forum?id=4VixXVZJkoY", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4, + 4 + ], + "rating": "4.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "We propose a new model to refine image-to-image translation via an adversarial ranking process. In particular, we simultaneously train two modules: a generator that translates an input image to the desired image with smooth subtle changes with respect to some specific attributes; and a ranker that ranks rival preferences consisting of the input image and the desired image. Rival preferences refer to the adversarial ranking process: (1) the ranker thinks no difference between the desired image and the input image in terms of the desired attributes; (2) the generator fools the ranker to believe that the desired image changes the attributes over the input image as desired. Real image preferences are introduced to guide the ranker to rank image pairs regarding the interested attributes only. With an effective ranker, the generator would \u201cwin\u201d the adversarial game by producing high-quality images that present desired changes over the attributes compared to the input image. 
The experiments demonstrate that our TRIP can generate high-fidelity images which exhibit smooth changes with the strength of the attributes.", + "title": "TRIP: Refining Image-to-Image Translation via Rival Preferences", + "authors": [ + "Yinghua Yao", + "Yuangang Pan", + "Ivor Tsang", + "Xin Yao" + ], + "emails": [ + "~Ivor_Tsang1", + "~Yinghua_Yao1", + "~Yuangang_Pan2", + "~Xin_Yao1" + ], + "rank": 2036 + }, + { + "url": "https://openreview.net/forum?id=Og7kVwRVStV", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5, + 5 + ], + "rating": "4.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Neural networks are known to be vulnerable to adversarial attacks - small, imperceptible perturbations that cause the network to misclassify an input. A recent line of work attempts to explain this behavior by positing the existence of non-robust features - well-generalizing but brittle features present in the data distribution that are learned by the network and can be perturbed to cause misclassification.\n\nIn this paper, we look at the dynamics of neural network training through the perspective of robust and non-robust features. We find that there are two very distinct pathways that neural network training can follow, depending on the hyperparameters used. In the first pathway, the network initially learns only predictive, robust features and weakly predictive non-robust features, and subsequently learns predictive, non-robust features. On the other hand, a network trained via the second pathway eschews predictive non-robust features altogether, and rapidly overfits the training data. We provide strong empirical evidence to corroborate this hypothesis, as well as theoretical analysis in a simplified setting. Key to our analysis is a better understanding of the relationship between predictive non-robust features and adversarial transferability. 
We present our findings in light of other recent results on the evolution of inductive biases learned by neural networks over the course of training.\n\nFinally, we digress to show that rather than being quirks of the data distribution, predictive non-robust features might actually occur across datasets with different distributions drawn from independent sources, indicating that they perhaps possess some meaning in terms of human semantics.", + "title": "SGD on Neural Networks learns Robust Features before Non-Robust", + "authors": [ + "Vikram Nitin" + ], + "emails": [ + "~Vikram_Nitin1" + ], + "rank": 2037 + }, + { + "url": "https://openreview.net/forum?id=aeA3LLDDQe2", + "ratings": [ + 5, + 4, + 5, + 4, + 6 + ], + "rating": "4.75", + "confidences": [ + 2, + 3, + 4, + 4, + 3 + ], + "abstract": "Missing data are prevalent and present daunting challenges in real data analysis. While there is a growing body of literature on fairness in analysis of fully observed data, there has been little work on investigating fairness in analysis of incomplete data when the goal is to develop a fair algorithm in the complete data domain where there are no missing values. In practice, a popular analytical approach for dealing with missing data is to use only the set of complete cases, i.e., observations with all features fully observed, as a representation of complete data in learning. However, depending on the missing data mechanism, the complete case domain and the complete data domain may have different data distributions and a fair algorithm in the complete case domain may show disproportionate bias towards some marginalized groups in the complete data domain. To fill this significant gap, we studying the problem of estimating fairness in the complete data domain for a model trained using observed data and evaluated in the complete case domain. We provide upper and lower bounds on the fairness estimation error and conduct numerical experiments to assess our theoretical results. 
Our work provides the first known results on fairness guarantee in analysis of incomplete data.", + "title": "Fairness guarantee in analysis of incomplete data", + "authors": [ + "Yiliang Zhang", + "Qi Long" + ], + "emails": [ + "upenn.edu", + "~Yiliang_Zhang1" + ], + "rank": 2038 + }, + { + "url": "https://openreview.net/forum?id=cQmVjJ-4eXq", + "ratings": [ + 5, + 5, + 4, + 5 + ], + "rating": "4.75", + "confidences": [ + 5, + 5, + 5, + 5 + ], + "abstract": "The basic framework of depth completion is to predict a pixel-wise dense depth map using very sparse input data.\nIn this paper, we try to solve this problem in a more effective way, by reformulating the regression-based depth estimation problem into a combination of depth plane classification and residual regression.\nOur proposed approach is to initially densify sparse depth information by figuring out which plane a pixel should lie among a number of discretized depth planes, and then calculate the final depth value by predicting the distance from the specified plane.\nThis will help the network to lessen the burden of directly regressing the absolute depth information from none, and to effectively obtain more accurate depth prediction result with less computation power and inference time.\nTo do so, we firstly introduce a novel way of interpreting depth information with the closest depth plane label p and a residual value r, as we call it, Plane-Residual (PR) representation.\nWe also propose a depth completion network utilizing PR representation consisting of a shared encoder and two decoders, where one classifies the pixel's depth plane label and the other one regresses the normalized distance from the classified depth plane.\nBy interpreting depth information in PR representation and using our corresponding depth completion network, we were able to acquire improved depth completion performance with faster computation, comparing to other recent approaches.", + "title": "Depth Completion using 
Plane-Residual Representation", + "authors": [ + "Byeong-Uk Lee", + "Kyunghyun Lee", + "In So Kweon" + ], + "emails": [ + "~In_So_Kweon2", + "~Kyunghyun_Lee2", + "~Byeong-Uk_Lee1" + ], + "rank": 2039 + }, + { + "url": "https://openreview.net/forum?id=A993YzEUKB7", + "decision": "Reject", + "ratings": [ + 6, + 5, + 4, + 5, + 4 + ], + "rating": "4.75", + "confidences": [ + 4, + 4, + 5, + 3, + 4 + ], + "abstract": "While modern deep neural architectures generalise well when test data is sampled from the same distribution as training data, they fail badly for cases when the test data distribution differs from the training distribution even along a few dimensions. This lack of out-of-distribution generalisation is increasingly manifested when the tasks become more abstract and complex, such as in relational reasoning. In this paper we propose a neuroscience-inspired inductive-biased module that can be readily amalgamated with current neural network architectures to improve out-of-distribution (o.o.d) generalisation performance on relational reasoning tasks. This module learns to project high-dimensional object representations to low-dimensional manifolds for more efficient and generalisable relational comparisons. We show that neural nets with this inductive bias achieve considerably better o.o.d generalisation performance for a range of relational reasoning tasks. 
We finally analyse the proposed inductive bias module to understand the importance of lower dimension projection, and propose an augmentation to the algorithmic alignment theory to better measure algorithmic alignment with generalisation.", + "title": "Extrapolatable Relational Reasoning With Comparators in Low-Dimensional Manifolds", + "authors": [ + "Duo Wang", + "Mateja Jamnik", + "Pietro Li\u00f2" + ], + "emails": [ + "~Duo_Wang1", + "~Pietro_Li\u00f21", + "~Mateja_Jamnik1" + ], + "rank": 2040 + }, + { + "url": "https://openreview.net/forum?id=J150Q1eQfJ4", + "decision": "Reject", + "ratings": [ + 3, + 7, + 4, + 5 + ], + "rating": "4.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "We investigate the performance of fully convolutional networks to predict the motion and interaction of surface waves in open and closed complex geometries. We focus on a U-Net type architecture and assess its ability to capture and extrapolate wave propagation in time as well as the reflection, interference and diffraction of waves. We investigate how well the network generalises both to long-time predictions and to geometric configurations not seen during training. We demonstrate that this neural network is capable of accurately predicting the height distribution of waves on a liquid surface within curved and multi-faceted open and closed geometries, when only simple box and right-angled corner geometries were seen during training. 
We found that the RMSE of the predictions remained of order 1\u00d710\u22124 times the characteristic length of the domain for at least 20 time-steps.", + "title": "Fully Convolutional Approach for Simulating Wave Dynamics", + "authors": [ + "Mario Lino Valencia", + "Chris D Cantwell", + "Eduardo Pignatelli", + "Stathi Fotiadis", + "Anil Anthony Bharath" + ], + "emails": [ + "~Mario_Lino_Valencia1", + "~Chris_D_Cantwell1", + "~Anil_Anthony_Bharath2", + "~Eduardo_Pignatelli1", + "~Stathi_Fotiadis1" + ], + "rank": 2041 + }, + { + "url": "https://openreview.net/forum?id=PXedDe28hWH", + "ratings": [ + 6, + 5, + 5, + 4 + ], + "rating": "4.75", + "confidences": [ + 1, + 4, + 3, + 4 + ], + "abstract": "Recent decades have witnessed great prosperity of deep learning in tackling various problems such as classification and decision making. The rapid development stimulates a novel framework, Learning-to-Learn (L2L), in which an automatic optimization algorithm (optimizer) modeled by neural networks is expected to learn rules for updating the target objective function (optimizee). Despite its advantages for specific problems, L2L still cannot replace classic methods due to its instability. Unlike hand-engineered algorithms, neural optimizers may suffer from the instability issue---when provided with similar states (a combination of some metrics to describe the optimizee), the same neural optimizer can produce quite different updates. Motivated by the stability property that should be satisfied by an ideal optimizer, we propose a regularization term that can enforce the smoothness and stability of the learned neural optimizers. Comprehensive experiments on the neural network training tasks demonstrate that the proposed regularization consistently improve the learned neural optimizers even when transferring to tasks with different architectures and data. Furthermore, we show that our regularizer can improve the performance of neural optimizers on few-shot learning tasks. 
", + "title": "Learning to Learn with Smooth Regularization", + "authors": [ + "Yuanhao Xiong", + "Cho-Jui Hsieh" + ], + "emails": [ + "~Cho-Jui_Hsieh1", + "~Yuanhao_Xiong1" + ], + "rank": 2042 + }, + { + "url": "https://openreview.net/forum?id=s0Chrsstpv2", + "decision": "Reject", + "ratings": [ + 7, + 4, + 4, + 4 + ], + "rating": "4.75", + "confidences": [ + 4, + 3, + 5, + 4 + ], + "abstract": "Machine learning models are used in many sensitive areas where besides predictive accuracy their comprehensibility is also important. Interpretability of prediction models is necessary to determine their biases and causes of errors, and is a necessary prerequisite for users' confidence. For complex state-of-the-art black-box models post-hoc model-independent explanation techniques are an established solution. Popular and effective techniques, such as IME, LIME, and SHAP, use perturbation of instance features to explain individual predictions. Recently, Slack et al. (2020) put their robustness into question by showing that their outcomes can be manipulated due to poor perturbation sampling employed. This weakness would allow dieselgate type cheating of owners of sensitive models who could deceive inspection and hide potentially unethical or illegal biases existing in their predictive models. This could undermine public trust in machine learning models and give rise to legal restrictions on their use. \n\nWe show that better sampling in these explanation methods prevents malicious manipulations. The proposed sampling uses data generators that learn the training set distribution and generate new perturbation instances much more similar to the training set. We show that the improved sampling increases the robustness of the LIME and SHAP, while previously untested method IME is already the most robust of all. 
", + "title": "Better sampling in explanation methods can prevent dieselgate-like deception", + "authors": [ + "Domen Vre\u0161", + "Marko Robnik \u0160ikonja" + ], + "emails": [ + "~Domen_Vre\u01611", + "fri.uni-lj.si" + ], + "rank": 2043 + }, + { + "url": "https://openreview.net/forum?id=bWCWPkZqE1A", + "ratings": [ + 4, + 4, + 5, + 6 + ], + "rating": "4.75", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "Few-shot learning aims to learn classifiers for new objects from a small number of labeled examples. But it does not do this in a vacuum. Usually, a strong inductive bias is borrowed from the supervised learning of base classes. This inductive bias enables more statistically efficient learning of the new classes. In this work, we show that no labels are needed to develop such an inductive bias, and that self-supervised learning can provide a powerful inductive bias for few-shot learning. This is particularly effective when the unlabeled data for learning such a bias contains not only examples of the base classes, but also examples of the novel classes. The setting in which unlabeled examples of the novel classes are available is known as the transductive setting. Our method outperforms state-of-the-art few-shot learning methods, including other transductive learning methods, by 3.9% for 5-shot accuracy on miniImageNet without using any base class labels. By benchmarking unlabeled-base-class (UBC) few-shot learning and UBC transductive few-shot learning, we demonstrate the great potential of self-supervised feature learning: self-supervision alone is sufficient to create a remarkably good inductive bias for few-shot learning. This motivates a rethinking of whether baseclass labels are necessary at all for few-shot learning. We also explore the relationship between self-supervised features and supervised features, comparing both their transferability and their complementarity in the non-transductive setting. 
By combining supervised and self-supervised features learned from base classes, we also achieve a new state-of-the-art in the non-transductive setting, outperforming all previous methods.", + "title": "SHOT IN THE DARK: FEW-SHOT LEARNING WITH NO BASE-CLASS LABELS", + "authors": [ + "Zitian Chen", + "Subhransu Maji", + "Erik Learned-Miller" + ], + "emails": [ + "~Erik_Learned-Miller2", + "~Subhransu_Maji1", + "~Zitian_Chen1" + ], + "rank": 2044 + }, + { + "url": "https://openreview.net/forum?id=FUtMxDTJ_h", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5, + 5 + ], + "rating": "4.75", + "confidences": [ + 4, + 5, + 4, + 3 + ], + "abstract": "This paper continues the quest for designing the optimal physics bias for neural networks predicting the dynamics of systems when the underlying dynamics shall be inferred from the data directly. The description of physical systems is greatly simplified when the underlying symmetries of the system are taken into account. In classical systems described via Hamiltonian dynamics this is achieved by using appropriate coordinates, so-called cyclic coordinates, which reveal conserved quantities directly. Without changing the Hamiltonian, these coordinates can be obtained via canonical transformations. We show that such coordinates can be searched for automatically with appropriate loss functions which naturally arise from Hamiltonian dynamics. As a proof of principle, we test our method on standard classical physics systems using synthetic and experimental data where our network identifies the conserved quantities in an unsupervised way and find improved performance on predicting the dynamics of the system compared to networks biasing just to the Hamiltonian. Effectively, these new coordinates guarantee that motion takes place on symmetry orbits in phase space, i.e.~appropriate lower dimensional sub-spaces of phase space. 
By fitting analytic formulae we recover that our networks are utilising conserved quantities such as (angular) momentum.", + "title": "Symmetry Control Neural Networks", + "authors": [ + "Marc Syvaeri", + "Sven Krippendorf" + ], + "emails": [ + "~Marc_Syvaeri1", + "~Sven_Krippendorf1" + ], + "rank": 2045 + }, + { + "url": "https://openreview.net/forum?id=d2m6yCwyJW", + "ratings": [ + 5, + 4, + 5, + 3, + 7 + ], + "rating": "4.75", + "confidences": [ + 4, + 2, + 3, + 4, + 3 + ], + "abstract": "Randomized value function has been shown as an effective exploration strategy for reinforcement learning (RL), which samples from a learned estimation of the distribution over the randomized Q-value function and then selects the optimal action. However, value function methods are known to suffer from value estimation error. Overfitting of value function is one of the main reasons to estimation error. To address this, in this paper, we propose a Bayesian linear regression with informative prior (IP-BLR) operator to leverage the data-dependent prior in the learning process of randomized value function, which can leverage the statistics of training results from previous iterations. We theoretically derive a generalization error bound for the proposed IP-BLR operation at each learning iteration based on PAC-Bayesian theory, showing a trade-off between the distribution obtained by IP-BLR and the informative prior. Since the optimal posterior that minimizes this generalization error bound is intractable, we alternatively develop an adaptive noise parameter update algorithm to balance this trade-off. The performance of the proposed IP-BLR deep Q-network (DQN) with adaptive noise parameter update is validated through some classical control tasks. 
It demonstrates that compared to existing methods using non-informative prior, the proposed IP-BLR DQN can achieve higher accumulated rewards in fewer interactions with the environment, due to the capabilities of more accurate value function approximation and better generalization.", + "title": "PAC-Bayesian Randomized Value Function with Informative Prior", + "authors": [ + "Yuankun Jiang", + "Chenglin Li", + "Junni Zou", + "Wenrui Dai", + "Hongkai Xiong" + ], + "emails": [ + "~Chenglin_Li2", + "~Hongkai_Xiong1", + "~Wenrui_Dai1", + "~Junni_Zou1", + "~Yuankun_Jiang1" + ], + "rank": 2046 + }, + { + "url": "https://openreview.net/forum?id=V2v7QcVkbhH", + "ratings": [ + 5, + 5, + 6, + 3 + ], + "rating": "4.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Conventional image classifiers are trained by randomly sampling mini-batches of images. To achieve state-of-the-art performance, sophisticated data augmentation schemes are used to expand the amount of training data available for sampling. In contrast, meta-learning algorithms sample not only images, but classes as well. We investigate how data augmentation can be used not only to expand the number of images available per class, but also to generate entirely new classes. We systematically dissect the meta-learning pipeline and investigate the distinct ways in which data augmentation can be integrated at both the image and class levels. 
Our proposed meta-specific data augmentation significantly improves the performance of meta-learners on few-shot classification benchmarks.", + "title": "Data Augmentation for Meta-Learning", + "authors": [ + "Renkun Ni", + "Micah Goldblum", + "Amr Sharaf", + "Kezhi Kong", + "Tom Goldstein" + ], + "emails": [ + "~Amr_Sharaf1", + "~Kezhi_Kong1", + "~Renkun_Ni1", + "~Micah_Goldblum1", + "~Tom_Goldstein1" + ], + "rank": 2047 + }, + { + "url": "https://openreview.net/forum?id=daLIpc7vQ2q", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 4 + ], + "rating": "4.74", + "confidences": [ + 5, + 5, + 4, + 5 + ], + "abstract": "We propose several different techniques to improve contrastive divergence training of energy-based models (EBMs). We first show that a gradient term neglected in the popular contrastive divergence formulation is both tractable to estimate and is important to avoid training instabilities in previous models. We further highlight how data augmentation, multi-scale processing, and reservoir sampling can be used to improve model robustness and generation quality. Thirdly, we empirically evaluate stability of model architectures and show improved performance on a host of benchmarks and use cases, such as image generation, OOD detection, and compositional generation.", + "title": "Improved Contrastive Divergence Training of Energy Based Models", + "authors": [ + "Yilun Du", + "Shuang Li", + "Joshua B. Tenenbaum", + "Igor Mordatch" + ], + "emails": [ + "~Joshua_B._Tenenbaum1", + "~Igor_Mordatch4", + "~Shuang_Li5", + "~Yilun_Du1" + ], + "rank": 2048 + }, + { + "url": "https://openreview.net/forum?id=k9GoaycDeio", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 4 + ], + "rating": "4.74", + "confidences": [ + 4, + 5, + 5, + 5 + ], + "abstract": "Despite its increasing popularity, deep neural networks are easily fooled. 
To alleviate this deficiency, researchers are actively developing new training strategies, which encourage models that are robust to small input perturbations. Several successful robust training methods have been proposed. However, many of them rely on strong adversaries, which can be prohibitively expensive to generate when the input dimension is high and the model structure is complicated. We adopt a new perspective on robustness and propose a novel training algorithm that allows a more effective use of adversaries. Our method improves the model robustness at each local ball centered around an adversary and then, by combining these local balls through a global term, achieves overall robustness. We demonstrate that, by maximizing the use of adversaries via focusing on local balls, we achieve high robust accuracy with weak adversaries. Specifically, our method reaches a similar robust accuracy level to the state of the art approaches trained on strong adversaries on MNIST, CIFAR-10 and CIFAR-100. As a result, the overall training time is reduced. Furthermore, when trained with strong adversaries, our method matches with the current state of the art on MNIST and outperforms them on CIFAR-10 and CIFAR-100.", + "title": "Improving Local Effectiveness for Global Robustness Training", + "authors": [ + "JINGYUE LU", + "M. Pawan Kumar" + ], + "emails": [ + "~JINGYUE_LU1", + "~M._Pawan_Kumar1" + ], + "rank": 2049 + }, + { + "url": "https://openreview.net/forum?id=6VPl9khIMz", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 4 + ], + "rating": "4.74", + "confidences": [ + 5, + 4, + 5, + 5 + ], + "abstract": "We study Graph Convolutional Networks (GCN) from the graph signal processing viewpoint by addressing a difference between learning graph filters with fully-connected weights versus trainable polynomial coefficients. We find that by stacking graph filters with learnable polynomial parameters, we can build a highly adaptive and robust vertex classification model. 
Our treatment here relaxes the low-frequency (or equivalently, high homophily) assumptions in existing vertex classification models, resulting a more ubiquitous solution in terms of spectral properties. Empirically, by using only one hyper-parameter setting, our model achieves strong results on most benchmark datasets across the frequency spectrum.", + "title": "Adaptive Stacked Graph Filter", + "authors": [ + "Hoang NT", + "Takanori Maehara", + "Tsuyoshi Murata" + ], + "emails": [ + "~Tsuyoshi_Murata1", + "~Takanori_Maehara1", + "~Hoang_NT1" + ], + "rank": 2050 + }, + { + "url": "https://openreview.net/forum?id=rF2kJgYAr3", + "ratings": [ + 3, + 5, + 6, + 5 + ], + "rating": "4.74", + "confidences": [ + 5, + 4, + 5, + 5 + ], + "abstract": "Application domains that require considering relationships among objects which have real-valued attributes are becoming even more important. In this paper we propose NeuralLog, a first-order logic language that is compiled to a neural network. The main goal of NeuralLog is to bridge logic programming and deep learning, allowing advances in both fields to be combined in order to obtain better machine learning models. The main advantages of NeuralLog are: to allow neural networks to be defined as logic programs; and to be able to handle numeric attributes and functions. We compared NeuralLog with two distinct systems that use first-order logic to build neural networks. 
We have also shown that NeuralLog can learn link prediction and classification tasks, using the same theory as the compared systems, achieving better results for the area under the ROC curve in four datasets: Cora and UWCSE for link prediction; and Yelp and PAKDD15 for classification; and comparable results for link prediction in the WordNet dataset.", + "title": "NeuralLog: a Neural Logic Language", + "authors": [ + "Victor Guimar\u00e3es", + "Vitor Santos Costa" + ], + "emails": [ + "~Vitor_Santos_Costa1", + "~Victor_Guimar\u00e3es1" + ], + "rank": 2051 + }, + { + "url": "https://openreview.net/forum?id=3FK30d5BZdu", + "decision": "Reject", + "ratings": [ + 4, + 6, + 5, + 4 + ], + "rating": "4.73", + "confidences": [ + 3, + 3, + 2, + 3 + ], + "abstract": "Decisions made by machine learning systems have increasing influence on the world, yet it is common for machine learning algorithms to assume that no such influence exists. An example is the use of the i.i.d. assumption in content recommendation. In fact, the (choice of) content displayed can change users' perceptions and preferences, or even drive them away, causing a shift in the distribution of users. We introduce the term auto-induced distributional shift (ADS) to describe the phenomenon of an algorithm causing a change in the distribution of its own inputs. Our goal is to ensure that machine learning systems do not leverage ADS to increase performance when doing so could be undesirable. We demonstrate that changes to the learning algorithm, such as the introduction of meta-learning, can cause hidden incentives for auto-induced distributional shift (HI-ADS) to be revealed. To address this issue, we introduce `unit tests' and a mitigation strategy for HI-ADS, as well as a toy environment for modelling real-world issues with HI-ADS in content recommendation, where we demonstrate that strong meta-learners achieve gains in performance via ADS. 
We show meta-learning and Q-learning both sometimes fail unit tests, but pass when using our mitigation strategy.", + "title": "Hidden Incentives for Auto-Induced Distributional Shift", + "authors": [ + "David Krueger", + "Tegan Maharaj", + "Jan Leike" + ], + "emails": [ + "~David_Krueger1", + "~Jan_Leike1", + "~Tegan_Maharaj1" + ], + "rank": 2052 + }, + { + "url": "https://openreview.net/forum?id=uHjLW-0tsCu", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 6 + ], + "rating": "4.73", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "In this paper, we propose a low-bit training framework for convolutional neural networks. Our framework focuses on reducing the energy and time consumption of convolution kernels, by quantizing all the convolutional operands (activation, weight, and error) to low bit-width. Specifically, we propose a multi-level scaling (MLS) tensor format, in which the element-wise bit-width can be largely reduced to simplify floating-point computations to nearly fixed-point. Then, we describe the dynamic quantization and the low-bit tensor convolution arithmetic to efficiently leverage the MLS tensor format. Experiments show that our framework achieves a superior trade-off between the accuracy and the bit-width than previous methods. When training ResNet-20 on CIFAR-10, \nall convolution operands can be quantized to 1-bit mantissa and 2-bit exponent, while retaining the same accuracy as the full-precision training. When training ResNet-18 on ImageNet, with 4-bit mantissa and 2-bit exponent, our framework can achieve an accuracy loss of less than 1%. 
Energy consumption analysis shows that our design can achieve over 6.8\u00d7 higher energy efficiency than training with floating-point arithmetic.", + "title": "Exploring the Potential of Low-Bit Training of Convolutional Neural Networks", + "authors": [ + "Kai Zhong", + "Xuefei Ning", + "Tianchen Zhao", + "Zhenhua Zhu", + "Shulin Zeng", + "Guohao Dai", + "Yu Wang", + "Huazhong Yang" + ], + "emails": [ + "~Tianchen_Zhao2", + "~Kai_Zhong2", + "mails.tsinghua.edu.cn", + "~Huazhong_Yang1", + "~Yu_Wang3", + "mail.tsinghua.edu.cn", + "~Xuefei_Ning1" + ], + "rank": 2053 + }, + { + "url": "https://openreview.net/forum?id=9D_Ovq4Mgho", + "decision": "Reject", + "ratings": [ + 7, + 4, + 3 + ], + "rating": "4.73", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "Conventional transfer learning leverages weights of pre-trained networks, but mandates the need for similar neural architectures. Alternatively, knowledge distillation can transfer knowledge between heterogeneous networks but often requires access to the original training data or additional generative networks. Knowledge transfer between networks can be improved by being agnostic to the choice of network architecture and reducing the dependence on original training data. We propose a knowledge transfer approach from a teacher to a student network wherein we train the student on an independent transferal dataset, whose annotations are generated by the teacher. Experiments were conducted on five state-of-the-art networks for semantic segmentation and seven datasets across three imaging modalities. We studied knowledge transfer from a single teacher, combination of knowledge transfer and fine-tuning, and knowledge transfer from multiple teachers. The student model with a single teacher achieved similar performance as the teacher; and the student model with multiple teachers achieved better performance than the teachers. 
The salient features of our algorithm include: 1) no need for original training data or generative networks, 2) knowledge transfer between different architectures, 3) ease of implementation for downstream tasks by using the downstream task dataset as the transferal dataset, 4) knowledge transfer of an ensemble of models, trained independently, into one student model. Extensive experiments demonstrate that the proposed algorithm is effective for knowledge transfer and easily tunable. ", + "title": "Network-Agnostic Knowledge Transfer for Medical Image Segmentation", + "authors": [ + "Shuhang Wang", + "Eugene Cheah", + "Elham Yousef Kalafi", + "Mercy Asiedu", + "Alex Benjamin", + "Vivek Kumar Singh", + "Ge Zhang", + "Viksit Kumar", + "Anthony Edward Samir" + ], + "emails": [ + "~Shuhang_Wang1", + "~Anthony_Edward_Samir1", + "mgh.harvard.edu" + ], + "rank": 2054 + }, + { + "url": "https://openreview.net/forum?id=65sCF5wmhpv", + "decision": "Reject", + "ratings": [ + 4, + 5, + 6, + 4 + ], + "rating": "4.73", + "confidences": [ + 3, + 3, + 4, + 5 + ], + "abstract": "We consider a decision making problem where an autonomous agent decides on which actions to take based on the observations it collects from the environment. We are interested in revealing the information structure of the observation space illustrating which type of observations are the most important (such as position versus velocity) and the dependence of this on the state of agent (such as at the bottom versus top of a hill). We approach this problem by associating a cost with collecting observations which increases with the accuracy. We adopt a reinforcement learning (RL) framework where the RL agent learns to adjust the accuracy of the observations alongside learning to perform the original task. 
We consider both the scenario where the accuracy can be adjusted continuously and also the scenario where the agent has to choose between given preset levels, such as taking a sample perfectly or not taking a sample at all. In contrast to the existing work that mostly focuses on sample efficiency during training, our focus is on the behaviour during the actual task. Our results illustrate that the RL agent can learn to use the observation space efficiently and obtain satisfactory performance in the original task while collecting effectively smaller amount of data. By uncovering the relative usefulness of different types of observations and trade-offs within, these results also provide insights for further design of active data acquisition schemes. ", + "title": "Learning to Observe with Reinforcement Learning", + "authors": [ + "Mehmet Koseoglu", + "Ece Kunduracioglu", + "Ayca Ozcelikkale" + ], + "emails": [ + "gmail.com", + "~Ayca_Ozcelikkale1", + "~Mehmet_Koseoglu1" + ], + "rank": 2055 + }, + { + "url": "https://openreview.net/forum?id=Ef1nNHQHZ20", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5, + 5 + ], + "rating": "4.73", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Deep neural networks are observed to be fragile against adversarial attacks, which have dramatically limited their practical applicability. On improving model robustness, the adversarial training techniques have proven effective and gained increasing attention from research communities. Existing adversarial training approaches mainly focus on perturbations to inputs, while the effect of the perturbations in hidden layers remains underexplored. In this work, we propose layer-wise adversarial defense which improves adversarial training by a noticeable margin. The basic idea of our method is to strengthen all of the hidden layers with perturbations that are proportional to the back-propagated gradients. 
In order to study the layer-wise neural dynamics, we formulate our approach from the perspective of ordinary differential equations (ODEs) and build up its extended relationship with conventional adversarial training methods, which tightens the relationship between neural networks and ODEs. In the implementation, we propose two different training algorithms by discretizing the ODE model with the Lie-Trotter and the Strang-Marchuk splitting schemes from the operator-splitting theory. Experiments on CIFAR-10 and CIFAR-100 benchmarks show that our methods consistently improve adversarial model robustness on top of widely-used strong adversarial training techniques. ", + "title": "Layer-wise Adversarial Defense: An ODE Perspective", + "authors": [ + "Zonghan Yang", + "Yang Liu", + "Chenglong Bao", + "Zuoqiang Shi" + ], + "emails": [ + "~Chenglong_Bao3", + "tsinghua.edu.cn", + "~Zuoqiang_Shi1", + "~Zonghan_Yang1" + ], + "rank": 2056 + }, + { + "url": "https://openreview.net/forum?id=yEnaS6yOkxy", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 4 + ], + "rating": "4.73", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Generative Adversarial Networks (GANs) have swiftly evolved to imitate increasingly complex image distributions. However, majority of the developments focus on performance of GANs on balanced datasets. We find that the existing GANs and their training regimes which work well on balanced datasets fail to be effective in case of imbalanced (i.e. long-tailed) datasets. In this work we introduce a novel and theoretically motivated Class Balancing regularizer for training GANs. Our regularizer makes use of the knowledge from a pre-trained classifier to ensure balanced learning of all the classes in the dataset. This is achieved via modelling the effective class frequency based on the exponential forgetting observed in neural networks and encouraging the GAN to focus on underrepresented classes. 
We demonstrate the utility of our contribution in two diverse scenarios: (i) Learning representations for long-tailed distributions, where we achieve better performance than existing approaches, and (ii) Generation of Universal Adversarial Perturbations (UAPs) in the data-free scenario for the large scale datasets, where we bridge the gap between data-driven and data-free approaches for crafting UAPs.", + "title": "Class Balancing GAN with a Classifier in the Loop", + "authors": [ + "Harsh Rangwani", + "Konda Reddy Mopuri", + "Venkatesh Babu Radhakrishnan" + ], + "emails": [ + "~Konda_Reddy_Mopuri3", + "~Harsh_Rangwani1", + "~Venkatesh_Babu_Radhakrishnan2" + ], + "rank": 2057 + }, + { + "url": "https://openreview.net/forum?id=iy3xVojOhV", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5, + 5 + ], + "rating": "4.73", + "confidences": [ + 4, + 2, + 5, + 4 + ], + "abstract": "Graph convolutional networks (GCN) achieved superior performances in graph-based semi-supervised learning (SSL) tasks. \nGenerative adversarial networks (GAN) also show the ability to increase the performance in SSL.\nHowever, there is still no good way to combine the GAN and GCN in graph-based SSL tasks.\nIn this work, we present GraphCGAN, a novel framework to incorporate adversarial learning with convolution-based graph neural networks, to operate on graph-structured data. \nIn GraphCGAN, we show that generator can generate topology structure and features of fake nodes jointly and boost the performance of convolution-based graph neural networks classifier.\nIn a number of experiments on benchmark datasets, we show that the proposed GraphCGAN outperforms the baseline methods by a significant margin. 
", + "title": "GraphCGAN: Convolutional Graph Neural Network with Generative Adversarial Networks", + "authors": [ + "Sheng Zhang", + "Rui Song", + "Wenbin Lu" + ], + "emails": [ + "~Sheng_Zhang8", + "~Wenbin_Lu1", + "~Rui_Song2" + ], + "rank": 2058 + }, + { + "url": "https://openreview.net/forum?id=amRmtfpYgDt", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5, + 5 + ], + "rating": "4.73", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Goal-oriented reinforcement learning algorithms are often good at exploration, not exploitation, while episodic algorithms excel at exploitation, not exploration. As a result, neither of these approaches alone can lead to a sample efficient algorithm in complex environments with high dimensional state space and delayed rewards. Motivated by these observations and shortcomings, in this paper, we introduce Regioned Episodic Reinforcement Learning (RERL) that combines the episodic and goal-oriented learning strengths and leads to a more sample efficient and ef- fective algorithm. RERL achieves this by decomposing the space into several sub-space regions and constructing regions that lead to more effective exploration and high values trajectories. 
Extensive experiments on various benchmark tasks show that RERL outperforms existing methods in terms of sample efficiency and final rewards.", + "title": "Regioned Episodic Reinforcement Learning", + "authors": [ + "Jiarui Jin", + "Cong Chen", + "Ming Zhou", + "Weinan Zhang", + "Rasool Fakoor", + "David Wipf", + "Yong Yu", + "Jun Wang", + "Alex Smola" + ], + "emails": [ + "sjtu.edu.cn", + "~Alex_Smola1", + "~Rasool_Fakoor1", + "~Jiarui_Jin1", + "~Weinan_Zhang1", + "~Yong_Yu1", + "~David_Wipf1", + "~Jun_Wang2", + "~Ming_Zhou2" + ], + "rank": 2059 + }, + { + "url": "https://openreview.net/forum?id=Atpv9GUhRt6", + "decision": "Reject", + "ratings": [ + 7, + 5, + 2, + 5 + ], + "rating": "4.73", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Neural networks have become the standard for image classification tasks. On one hand, convolutional neural networks (CNNs) achieve state-of-the-art performance by learning from a regular grid representation of images. On the other hand, graph neural networks (GNNs) have shown promise in learning image classification from an embedded superpixel graph. However, in the latter, studies have been restricted to SLIC superpixels, where 1) a single target number of superpixels is arbitrarily defined for an entire dataset irrespective of differences across images and 2) the superpixels in a given image are of similar size despite intrinsic multiscale structure. In this study, we investigate learning from a new principled representation in which individual images are represented by an image-specific number of multiscale superpixels. We propose WaveMesh, a wavelet-based superpixeling algorithm, where the number and sizes of superpixels in an image are systematically computed based on the image content. We also present WavePool, a spatially heterogeneous pooling scheme tailored to WaveMesh superpixels. 
We study the feasibility of learning from the WaveMesh superpixel representation using SplineCNN, a state-of-the-art network for image graph classification. We show that under the same network architecture and training settings, SplineCNN with original Graclus-based pooling learns from WaveMesh superpixels on-par with SLIC superpixels. Additionally, we observe that the best performance is achieved when replacing Graclus-based pooling with WavePool while using WaveMesh superpixels.", + "title": "Learning from multiscale wavelet superpixels using GNN with spatially heterogeneous pooling", + "authors": [ + "Maxime Bassenne", + "Varun Vasudevan", + "Lei Xing" + ], + "emails": [ + "~Lei_Xing1", + "stanford.edu", + "~Maxime_Bassenne1" + ], + "rank": 2060 + }, + { + "url": "https://openreview.net/forum?id=Q8ZdJahesWe", + "ratings": [ + 5, + 5, + 4, + 5 + ], + "rating": "4.73", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "We study the problem of protecting information when learning with graph-structured data. While the advent of Graph Neural Networks (GNNs) has greatly improved node and graph representational learning in many applications, the neighborhood aggregation paradigm exposes additional vulnerabilities to attackers seeking to extract node-level information about sensitive attributes. To counter this, we propose a minimax game between the desired GNN encoder and the worst-case attacker. The resulting adversarial training creates a strong defense against inference attacks, while only suffering small loss in task performance. We analyze the effectiveness of our framework against a worst-case adversary, and characterize the trade-off between predictive accuracy and adversarial defense. 
Experiments across multiple datasets from recommender systems, knowledge graphs and quantum chemistry demonstrate that the proposed approach provides a robust defense across various graph structures and tasks, while producing competitive GNN encoders.", + "title": "Graph Adversarial Networks: Protecting Information against Adversarial Attacks", + "authors": [ + "Peiyuan Liao", + "Han Zhao", + "Keyulu Xu", + "Tommi S. Jaakkola", + "Geoff Gordon", + "Stefanie Jegelka", + "Ruslan Salakhutdinov" + ], + "emails": [ + "~Peiyuan_Liao1", + "~Stefanie_Jegelka3", + "~Ruslan_Salakhutdinov1", + "~Keyulu_Xu1", + "~Han_Zhao1", + "~Geoff_Gordon2", + "~Tommi_S._Jaakkola1" + ], + "rank": 2061 + }, + { + "url": "https://openreview.net/forum?id=PkqwRo2wjuW", + "decision": "Reject", + "ratings": [ + 4, + 6, + 5, + 4 + ], + "rating": "4.73", + "confidences": [ + 4, + 3, + 5, + 3 + ], + "abstract": "We target the problem of proving the semantic equivalence between two complex expressions represented as typed trees, and demonstrate our system on expressions from a rich multi-type symbolic language for linear algebra. We propose the first graph-to-sequence deep learning system to generate axiomatic proofs of equivalence between program pairs. We generate expressions which include scalars, vectors and matrices and 16 distinct operators combining them, with 147 distinct axioms of equivalence. We study the robustness of the system to generate proofs of increasing length, demonstrating how incremental graph-to-sequence networks can learn to represent complex and verifiable symbolic reasoning. 
It achieves 93% average true positive coverage on 10,000 test cases while ensuring zero false positives by design.", + "title": "Learning Axioms to Compute Verifiable Symbolic Expression Equivalence Proofs Using Graph-to-Sequence Networks", + "authors": [ + "Steven James Kommrusch", + "Louis-Noel Pouchet", + "Theo Barolett" + ], + "emails": [ + "~Louis-Noel_Pouchet2", + "~Steven_James_Kommrusch1", + "inria.fr" + ], + "rank": 2062 + }, + { + "url": "https://openreview.net/forum?id=XRo78JEfVnt", + "ratings": [ + 4, + 4, + 6 + ], + "rating": "4.73", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "Every length-(n+1) sequence of orthogonal polynomials is uniquely represented by two length-(n+1) sequences of coefficients \u03b1 and \u03b2. We make this representation learnable by gradient-based methods. Orthogonal polynomial operations may be automatically differentiated, but this uses O(n2) memory and is very slow in practice. By exploiting reversibility, we derive a differentiation algorithm which uses O(n) memory and is much faster in practice. Using this algorithm, fixed polynomial transforms (e.g. discrete cosine transforms) can be replaced by learnable layers. These are more expressive, but they retain the computational efficiency and analytic tractability of orthogonal polynomials.\n\nAs another application, we present an algorithm for approximating the minimal value f(w\u2217) of a general nonconvex objective f, without finding the minimizer w\u2217. It follows a scheme recently proposed by Lasserre (2020), whose core algorithmic problem is to find the sequence of polynomials orthogonal to a given probability distribution. 
Despite the general intractability of this problem, we observe encouraging initial results on some test cases.", + "title": "Optimizing Over All Sequences of Orthogonal Polynomials", + "authors": [ + "Shiva Kaul" + ], + "emails": [ + "~Shiva_Kaul1" + ], + "rank": 2063 + }, + { + "url": "https://openreview.net/forum?id=bFeSb705ekd", + "ratings": [ + 5, + 5, + 6, + 3 + ], + "rating": "4.73", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Recent studies have unveiled the vulnerabilities of deep ranking models, where\nan imperceptible perturbation could trigger dramatic changes in the ranking\nresult. However, previous attempts focus on manipulating absolute ranks of\ncertain candidates, while the possibility of adjusting their relative order\nremains under-explored. The objective of this paper is to formalize and\npractically implement a new adversarial attack against deep ranking systems,\ni.e., the Order Attack, which covertly alters the relative order of a selected\nset of candidates according to a permutation vector predefined by the attacker,\nwith only limited interference to other unrelated candidates. Although this\nOrder Attack can be formulated as a triplet-style loss constraint imposing an\ninequality chain that reflects the attacker's desired permutation, direct\noptimization of such loss is inapplicable in a real-world black-box attack\nscenario due to the inaccessibility of gradients, limited query budget,\ntruncated ranking results, and lack of similarity scores. To address these\nchallenges, we propose a new Short-range Ranking Correlation metric as a\nsurrogate objective function to approximate Kendall's ranking correlation while\nmaintaining robustness to these practical limitations. The proposed white-box\nand black-box attacks are evaluated on the Fashion-MNIST and\nStanford-Online-Products datasets. Moreover, the black-box attack is\nsuccessfully implemented on a major e-commerce platform. 
Extensive\nquantitative and qualitative experimental evaluations demonstrate the\neffectiveness of our proposed methods, revealing deep ranking systems'\nvulnerability to the Order Attack. \n", + "title": "Practical Order Attack in Deep Ranking", + "authors": [ + "Mo Zhou", + "Le Wang", + "Zhenxing Niu", + "Qilin Zhang", + "Xu Yinghui", + "Nanning Zheng", + "Gang Hua" + ], + "emails": [ + "~Mo_Zhou1", + "~Nanning_Zheng1", + "~Le_Wang1", + "~Zhenxing_Niu1", + "~Xu_Yinghui1", + "~Qilin_Zhang2", + "~Gang_Hua3" + ], + "rank": 2064 + }, + { + "url": "https://openreview.net/forum?id=E_U8Zvx7zrf", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 4 + ], + "rating": "4.73", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "The heavy communication for model synchronization is a major bottleneck for scaling up the distributed deep neural network training to many workers. Moreover, model synchronization can suffer from long delays in scenarios such as federated learning and geo-distributed training. Thus, it is crucial that the distributed training methods are both \\textit{delay-tolerant} AND \\textit{communication-efficient}. However, existing works cannot simultaneously address the communication delay and bandwidth constraint. To address this important and challenging problem, we propose a novel training framework OLCO\\textsubscript{3} to achieve delay tolerance with a low communication budget by using stale information. OLCO\\textsubscript{3} introduces novel staleness compensation and compression compensation to combat the influence of staleness and compression error. Theoretical analysis shows that OLCO\\textsubscript{3} achieves the same sub-linear convergence rate as the vanilla synchronous stochastic gradient descent (SGD) method. 
Extensive experiments on deep learning tasks verify the effectiveness of OLCO\\textsubscript{3} and its advantages over existing works.", + "title": "Delay-Tolerant Local SGD for Efficient Distributed Training", + "authors": [ + "An Xu", + "Xiao Yan", + "Hongchang Gao", + "Heng Huang" + ], + "emails": [ + "~Xiao_Yan2", + "~An_Xu1", + "~Heng_Huang1", + "~Hongchang_Gao1" + ], + "rank": 2065 + }, + { + "url": "https://openreview.net/forum?id=JmnFvgMSjgg", + "ratings": [ + 5, + 5, + 4, + 5 + ], + "rating": "4.73", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Conditional generative adversarial networks (cGANs) play an important role in multimodal image-to-image translation. We propose Diversity Augmented conditional Generative Adversarial Network (DivAugGAN), a highly effective solution to further resolve the mode collapse problem and enhance the diversity for the generated images. DivAugGAN functions as a regularizer to maximize the distinction of the generating samples when different noise vectors are injected. We also exert extra constraint on the generator to ensure the relative variation consistency in the translation process. This guarantees that the changing scale of the generated images in the image space is coherent to the difference of the injected noise vectors in the latent space. It also reduces the chances to bring about unexpected mode override and mode fusion issues. Experimental results on both two-domain and multi-domain multimodal image-to-image translation tasks demonstrate its effectiveness. DivAugGAN leads to consistent diversity augmentations and visual quality improvements for the developed models. We also achieves state-of-the-art performances on multiple datasets in terms of widely used quantitative evaluation metrics. 
DivAugGAN can be easily integrated into any objectives in conditional generative models as a regularizer for diversity augmentations and quality enhancements without any additional computation overheads compromise The source code and pre-trained models of our method is available at https://github.com/anomymous-gan/DivAugGAN. ", + "title": "Diversity Augmented Conditional Generative Adversarial Network for Enhanced Multimodal Image-to-Image Translation", + "authors": [ + "Yunlong MENG", + "Lin Xu" + ], + "emails": [ + "~Yunlong_MENG1", + "~Lin_Xu2" + ], + "rank": 2066 + }, + { + "url": "https://openreview.net/forum?id=qoTcTS9-IZ-", + "decision": "Reject", + "ratings": [ + 7, + 5, + 5, + 3 + ], + "rating": "4.73", + "confidences": [ + 3, + 4, + 3, + 5 + ], + "abstract": "Recent research has been focused on two different approaches to studying neural networks training in the limit of infinite width (1) a mean-field (MF) and (2) a constant neural tangent kernel (NTK) approximations. These two approaches have different scaling of hyperparameters with the width of a network layer and as a result, different infinite-width limit models. Restricting ourselves to single hidden layer nets with zero-mean initialization trained for binary classification with SGD, we propose a general framework to study how the limit behavior of neural models depends on the scaling of hyperparameters with network width. Our framework allows us to derive scaling for existing MF and NTK limits, as well as an uncountable number of other scalings that lead to a dynamically stable limit behavior of corresponding models. However, only a finite number of distinct limit models are induced by these scalings. Each distinct limit model corresponds to a unique combination of such properties as boundedness of logits and tangent kernels at initialization or stationarity of tangent kernels. 
Existing MF and NTK limit models, as well as one novel limit model, satisfy most of the properties demonstrated by finite-width models. We also propose a novel initialization-corrected mean-field limit that satisfies all properties noted above, and its corresponding model is a simple modification for a finite-width model.", + "title": "Dynamically Stable Infinite-Width Limits of Neural Classifiers", + "authors": [ + "Eugene Golikov" + ], + "emails": [ + "~Eugene_Golikov1" + ], + "rank": 2067 + }, + { + "url": "https://openreview.net/forum?id=3Jf4Fr2I4T2", + "ratings": [ + 5, + 4, + 5, + 5 + ], + "rating": "4.73", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Bayesian optimization is a class of global optimization techniques. In Bayesian optimization, the underlying objective function is modeled as a realization of a Gaussian process. Although the Gaussian process assumption implies a random distribution of the Bayesian optimization outputs, quantification of this uncertainty is rarely studied in the literature. In this work, we propose a novel approach to assess the output uncertainty of Bayesian optimization algorithms, which proceeds by constructing confidence regions of the maximum point (or value) of the objective function. These regions can be computed efficiently, and their confidence levels are guaranteed by the uniform error bounds for sequential Gaussian process regression newly developed in the present work. 
Our theory provides a unified uncertainty quantification framework for all existing sequential sampling policies and stopping criteria.", + "title": "Uncertainty Quantification for Bayesian Optimization", + "authors": [ + "Rui Tuo", + "Wenjia Wang" + ], + "emails": [ + "~Rui_Tuo1", + "~Wenjia_Wang2" + ], + "rank": 2068 + }, + { + "url": "https://openreview.net/forum?id=WqXAKcwfZtI", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 5 + ], + "rating": "4.72", + "confidences": [ + 4, + 5, + 5, + 4 + ], + "abstract": "The problem of unsupervised domain adaptation arises in a variety of practical applications where the distribution of the training samples differs from those used at test time. The existing theory of domain adaptation derived generalization bounds based on divergence measures that are hard to optimize in practice. This has led to a large disconnect between theory and state-of-the-art methods. In this paper, we propose a novel domain-adversarial framework that introduces new theory for domain adaptation and leads to practical learning algorithms with neural networks. In particular, we derive a novel generalization bound that utilizes a new measure of discrepancy between distributions based on a variational characterization of f-divergences. We show that our bound recovers the theoretical results from Ben-David et al. (2010a) as a special case with a particular choice of divergence, and also supports divergences typically used in practice. We derive a general algorithm for domain-adversarial learning for the complete family of f-divergences. We provide empirical results for several f-divergences and show that some, not considered previously in domain-adversarial learning, achieve state-of-the-art results in practice. We provide empirical insights into how choosing a particular divergence affects the transfer performance on real-world datasets. 
By further recognizing the optimization problem as a Stackelberg game, we utilize the latest optimizers from the game optimization literature, achieving additional performance boosts in our training algorithm. We show that our f-domain adversarial framework achieves state-of-the-art results on the challenging Office-31 and Office-Home datasets without extra hyperparameters.", + "title": "f-Domain-Adversarial Learning: Theory and Algorithms for Unsupervised Domain Adaptation with Neural Networks", + "authors": [ + "David Acuna", + "Guojun Zhang", + "Marc T Law", + "Sanja Fidler" + ], + "emails": [ + "~Sanja_Fidler1", + "~Marc_T_Law1", + "~Guojun_Zhang1", + "~David_Acuna1" + ], + "rank": 2069 + }, + { + "url": "https://openreview.net/forum?id=SeFiP8YAJy", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5, + 4, + 6 + ], + "rating": "4.72", + "confidences": [ + 4, + 4, + 3, + 4, + 3 + ], + "abstract": "Recent research on compressing deep neural networks has focused on reducing the number of parameters. Smaller networks are easier to export and deploy on edge-devices. We introduce Adjoined networks as a training approach that can regularize and compress any CNN-based neural architecture. Our one-shot learning paradigm trains both the original and the smaller networks together. The parameters of the smaller network are shared across both the architectures. We prove strong theoretical guarantees on the regularization behavior of the adjoint training paradigm. We complement our theoretical analysis by an extensive empirical evaluation of both the compression and regularization behavior of adjoint networks. For resnet-50 trained adjointly on Imagenet, we are able to achieve a 13.7\u00d7 reduction in the number of parameters and a 3\u00d7 improvement in inference time without any significant drop in accuracy. For the same architecture on CIFAR-100, we are able to achieve a 99.7\u00d7 reduction in the number of parameters and a 5\u00d7 improvement in inference time. 
On both these datasets, the original network trained in the adjoint fashion gains about 3% in top-1 accuracy as compared to the same network trained in the standard fashion. ", + "title": "Better Together: Resnet-50 accuracy with 13\u00d7 fewer parameters and at 3\u00d7 speed", + "authors": [ + "Utkarsh Nath", + "Shrinu Kushagra" + ], + "emails": [ + "nyu.edu", + "~Shrinu_Kushagra1" + ], + "rank": 2070 + }, + { + "url": "https://openreview.net/forum?id=AhgxLhJvbpS", + "ratings": [ + 5, + 5, + 5, + 4 + ], + "rating": "4.72", + "confidences": [ + 4, + 5, + 4, + 5 + ], + "abstract": "Weight pruning is an effective technique to reduce the model size and inference time for deep neural networks in real-world deployments. However, since magnitudes and relative importance of weights are very different for different layers of a neural network, existing methods rely on either manual tuning or handcrafted heuristic rules to find appropriate pruning rates individually for each layer. This approach general leads to suboptimal performance. In this paper, by directly working on the probability space, we propose an effective network sparsification method called {\\it probabilistic masking} (ProbMask), which solves a natural sparsification formulation under global sparsity constraint. The key idea is to use probability as a global criterion for all layers to measure the weight importance. An appealing feature of ProbMask is that the amounts of weight redundancy can be learned automatically via our constraint and thus we avoid the problem of tuning pruning rates individually for different layers in a network. Extensive experimental results on CIFAR-10/100 and ImageNet demonstrate that our method is highly effective, and can outperform previous state-of-the-art methods by a significant margin, especially in the high pruning rate situation. Notably, the gap of Top-1 accuracy between our ProbMask and existing methods can be up to 10\\%. 
As a by-product, we show ProbMask is also highly effective in identifying supermasks, which are subnetworks with high performance in a randomly weighted dense neural network.", + "title": "Effective Training of Sparse Neural Networks under Global Sparsity Constraint", + "authors": [ + "Xiao Zhou", + "Weizhong Zhang", + "Tong Zhang" + ], + "emails": [ + "~Tong_Zhang2", + "~Weizhong_Zhang1", + "~Xiao_Zhou4" + ], + "rank": 2071 + }, + { + "url": "https://openreview.net/forum?id=Q1aiM7sCi1", + "decision": "Reject", + "ratings": [ + 4, + 3, + 6, + 6 + ], + "rating": "4.72", + "confidences": [ + 4, + 5, + 5, + 4 + ], + "abstract": "Persistence diagrams concisely represent the topology of a point cloud whilst having strong theoretical guarantees. Most current approaches to integrating topological information into machine learning implicitly map persistence diagrams to a Hilbert space, resulting in deformation of the underlying metric structure whilst also generally requiring prior knowledge about the true topology of the space. In this paper we give an algorithm for Fuzzy c-Means (FCM) clustering directly on the space of persistence diagrams, enabling unsupervised learning that automatically captures the topological structure of data, with no prior knowledge or additional processing of persistence diagrams. We prove the same convergence guarantees as traditional FCM clustering: every convergent subsequence of iterates tends to a local minimum or saddle point. 
We end by presenting experiments where our fuzzy topological clustering algorithm allows for unsupervised top-k candidate selection in settings where (i) the properties of persistence diagrams make them the natural choice over geometric equivalents, and (ii) the probabilistic membership values let us rank candidates in settings where verifying candidate suitability is expensive: lattice structure classification in materials science and pre-trained model selection in machine learning.", + "title": "Fuzzy c-Means Clustering for Persistence Diagrams", + "authors": [ + "Thomas Davies", + "Jack Aspinall", + "Bryan Wilder", + "Long Tran-Thanh" + ], + "emails": [ + "~Thomas_Davies1", + "~Bryan_Wilder1", + "materials.ox.ac.uk", + "warwick.ac.uk" + ], + "rank": 2072 + }, + { + "url": "https://openreview.net/forum?id=syv-WdGWvqX", + "ratings": [ + 4, + 5, + 4, + 6 + ], + "rating": "4.72", + "confidences": [ + 5, + 5, + 4, + 4 + ], + "abstract": "Domain generalization aims at training machine learning models to perform robustly across different and unseen domains. Several recent methods use multiple datasets to train models to extract domain-invariant features, hoping to generalize to unseen domains. Instead, we explicitly train domain-dependant representations by using ad-hoc batch normalization layers to collect independent domain\u2019s statistics. We propose to use these statistics to map domains in a shared latent space, where membership to a domain can be measured by means of a distance function. At test time, we project samples from an unknown domain into the same space and express their domain as a linear combination of the known ones. We apply the same mapping strategy at training and test time, learning both a latent representation and a powerful but lightweight ensemble model. 
We show a significant increase in classification accuracy over current state-of-the-art techniques on popular domain generalization benchmarks: PACS, Office-31 and Office-Caltech.", + "title": "Batch Normalization Embeddings for Deep Domain Generalization", + "authors": [ + "Mattia Seg\u00f9", + "Alessio Tonioni", + "Federico Tombari" + ], + "emails": [ + "~Federico_Tombari1", + "~Alessio_Tonioni1", + "~Mattia_Seg\u00f91" + ], + "rank": 2073 + }, + { + "url": "https://openreview.net/forum?id=tnq_O52RVbR", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5, + 5 + ], + "rating": "4.72", + "confidences": [ + 5, + 5, + 4, + 4 + ], + "abstract": "We introduce the problem of explaining graph generation, formulated as controlling the generative process to produce desired graphs with explainable structures. By directing this generative process, we can explain the observed outcomes. We propose SHADOWCAST, a controllable generative model capable of mimicking networks and directing the generation, as an approach to this novel problem. The proposed model is based on a conditional generative adversarial network for graph data. We design it with the capability to control the conditions using a simple and transparent Markov model. Comprehensive experiments on three real-world network datasets demonstrate our model's competitive performance in the graph generation task. Furthermore, we control SHADOWCAST to generate graphs of different structures to show its effective controllability and explainability. 
As the first work to pose the problem of explaining generated graphs by controlling the generation, SHADOWCAST paves the way for future research in this exciting area.", + "title": "SHADOWCAST: Controllable Graph Generation with Explainability", + "authors": [ + "Wesley Joon-Wie Tann", + "Ee-Chien Chang", + "Bryan Hooi" + ], + "emails": [ + "~Wesley_Joon-Wie_Tann1", + "~Ee-Chien_Chang1", + "~Bryan_Hooi1" + ], + "rank": 2074 + }, + { + "url": "https://openreview.net/forum?id=xH251EA80go", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5, + 5 + ], + "rating": "4.71", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Deep models have achieved great success in many applications. However, vanilla deep models are not well-designed against the input perturbation. In this work, we take an initial step to designing a simple robust layer as a lightweight plug-in for vanilla deep models. To achieve this goal, we first propose a fast sparse coding and dictionary learning algorithm for sparse coding problem with an exact k-sparse constraint or l0 norm regularization. Our method comes with a closed-form approximation for the sparse coding phase by taking advantage of a novel structured dictionary. With this handy approximation, we propose a simple sparse denoising layer (SDL) as a lightweight robust plug-in. 
Extensive experiments on both classification and reinforcement learning tasks manifest the effectiveness of our methods.", + "title": "A Simple Sparse Denoising Layer for Robust Deep Learning", + "authors": [ + "Yueming Lyu", + "Xingrui Yu", + "Ivor Tsang" + ], + "emails": [ + "~Ivor_Tsang1", + "~Xingrui_Yu1", + "~Yueming_Lyu1" + ], + "rank": 2075 + }, + { + "url": "https://openreview.net/forum?id=ucuia1JiY9", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4 + ], + "rating": "4.71", + "confidences": [ + 5, + 5, + 4 + ], + "abstract": "Clustering with constraints has gained significant attention in the field of semi-supervised machine learning as it can leverage partial prior information on a growing amount of unlabelled data. Following recent advances in deep generative models, we derive a novel probabilistic approach to constrained clustering that can be trained efficiently in the framework of stochastic gradient variational Bayes. In contrast to existing approaches, our model (CVaDE) uncovers the underlying distribution of the data conditioned on prior clustering preferences, expressed as pairwise constraints. The inclusion of such constraints allows the user to guide the clustering process towards a desirable partition of the data by indicating which samples should or should not belong to the same class. We provide extensive experiments to demonstrate that CVaDE shows superior clustering performances and robustness compared to state-of-the-art deep constrained clustering methods in a variety of data sets. 
We further demonstrate the usefulness of our approach on challenging real-world medical applications and face image generation.", + "title": "A Probabilistic Approach to Constrained Deep Clustering", + "authors": [ + "Laura Manduchi", + "Kieran Chin-Cheong", + "Holger Michel", + "Sven Wellmann", + "Julia E Vogt" + ], + "emails": [ + "~Laura_Manduchi2", + "~Julia_E_Vogt1", + "inf.ethz.ch", + "barmherzige-regensburg.de", + "klinik.uni-regensburg.de" + ], + "rank": 2076 + }, + { + "url": "https://openreview.net/forum?id=Te1aZ2myPIu", + "decision": "Reject", + "ratings": [ + 4, + 5, + 6, + 4 + ], + "rating": "4.71", + "confidences": [ + 4, + 2, + 4, + 4 + ], + "abstract": "Developing certified models that can provably defense adversarial perturbations is important in machine learning security. Recently, randomized smoothing, combined with other techniques (Cohen et al., 2019; Salman et al., 2019), has been shown to be an effective method to certify models under l2 perturbations. Existing work for certifying l2 perturbations added the same level of Gaussian noise to each sample. The noise level determines the trade-off between the test accuracy and the average certified robust radius. We propose to further improve the defense via sample-wise randomized smoothing, which assigns different noise levels to different samples. Specifically, we propose a pretrain-to-finetune framework that first pretrains a model and then adjusts the noise levels for higher performance based on the model\u2019s outputs. For certification, we carefully allocate specific robust regions for each test sample. 
We perform extensive experiments on CIFAR-10 and MNIST datasets and the experimental results demonstrate that our method can achieve better accuracy-robustness trade-off in the transductive setting.", + "title": "Pretrain-to-Finetune Adversarial Training via Sample-wise Randomized Smoothing", + "authors": [ + "Lei Wang", + "Runtian Zhai", + "Di He", + "Liwei Wang", + "Li Jian" + ], + "emails": [ + "~Lei_Wang22", + "~Liwei_Wang1", + "~Li_Jian1", + "~Runtian_Zhai1", + "~Di_He1" + ], + "rank": 2077 + }, + { + "url": "https://openreview.net/forum?id=LFjnKhTNNQD", + "decision": "Reject", + "ratings": [ + 5, + 3, + 6, + 5, + 5 + ], + "rating": "4.71", + "confidences": [ + 4, + 5, + 4, + 4, + 4 + ], + "abstract": "Adversarial training is the industry standard for producing models that are robust to small adversarial perturbations. However, machine learning practitioners need models that are robust to other kinds of changes that occur naturally, such as changes in the style or illumination of input images. Such changes in input distribution have been effectively modeled as shifts in the mean and variance of deep image features. We adapt adversarial training by adversarially perturbing these feature statistics, rather than image pixels, to produce models that are robust to distributional shifts. We also visualize images from adversarially crafted distributions. Our method, Adversarial Batch Normalization (AdvBN), significantly improves the performance of ResNet-50 on ImageNet-C (+8.1%), Stylized-ImageNet (+6.7%), and ImageNet-Instagram (+3.9%) over standard training practices. 
In addition, we demonstrate that AdvBN can also improve generalization on semantic segmentation.", + "title": "Prepare for the Worst: Generalizing across Domain Shifts with Adversarial Batch Normalization", + "authors": [ + "Manli Shu", + "Zuxuan Wu", + "Micah Goldblum", + "Tom Goldstein" + ], + "emails": [ + "~Micah_Goldblum1", + "~Tom_Goldstein1", + "~Manli_Shu1", + "~Zuxuan_Wu1" + ], + "rank": 2078 + }, + { + "url": "https://openreview.net/forum?id=Kr7CrZPPPo", + "decision": "Reject", + "ratings": [ + 6, + 5, + 4, + 4 + ], + "rating": "4.71", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Supervised learning models constructed under the i.i.d. assumption have often been shown to exploit spurious or brittle predictive signals instead of more robust ones present in the training data. Inspired by Quality-Diversity algorithms, in this work we train a collection of classifiers to learn distinct solutions to a classification problem, with the goal of learning to exploit a variety of predictive signals present in the training data. We propose an information-theoretic measure of model diversity based on minimizing an estimate of conditional total correlation of final layer representations across models given the label. We consider datasets with synthetically injected spurious correlations and evaluate our framework's ability to rapidly adapt to a change in distribution that destroys the spurious correlation. We compare our method to a variety of baselines under this evaluation protocol, showing that it is competitive with other approaches while being more successful at isolating distinct signals. 
We also show that our model is competitive with Invariant Risk Minimization under this evaluation protocol without requiring access to the environment information required by IRM to discriminate between spurious and robust signals.", + "title": "Learning a Non-Redundant Collection of Classifiers", + "authors": [ + "Daniel Pace", + "Alessandra Russo", + "Murray Shanahan" + ], + "emails": [ + "~Murray_Shanahan1", + "~Daniel_Pace1", + "~Alessandra_Russo1" + ], + "rank": 2079 + }, + { + "url": "https://openreview.net/forum?id=OZgVHzdKicb", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5, + 5 + ], + "rating": "4.71", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Exploration in reinforcement learning is, in general, a challenging problem. In this work, we study a more tractable class of reinforcement learning problems defined by data that provides examples of successful outcome states. In this case, the reward function can be obtained automatically by training a classifier to classify states as successful or not. We argue that, with appropriate representation and regularization, such a classifier can guide a reinforcement learning algorithm to an effective solution. However, as we will show, this requires the classifier to make uncertainty-aware predictions that are very difficult with standard deep networks. To address this, we propose a novel mechanism for obtaining calibrated uncertainty based on an amortized technique for computing the normalized maximum likelihood distribution. We show that the resulting algorithm has a number of intriguing connections to both count-based exploration methods and prior algorithms for learning reward functions from data, while being able to guide algorithms towards the specified goal more effectively. 
We show how using amortized normalized maximum likelihood for reward inference is able to provide effective reward guidance for solving a number of challenging navigation and robotic manipulation tasks which prove difficult for other algorithms.", + "title": "Reinforcement Learning with Bayesian Classifiers: Efficient Skill Learning from Outcome Examples", + "authors": [ + "Kevin Li", + "Abhishek Gupta", + "Vitchyr H. Pong", + "Ashwin Reddy", + "Aurick Zhou", + "Justin Yu", + "Sergey Levine" + ], + "emails": [ + "~Vitchyr_H._Pong1", + "~Ashwin_Reddy1", + "~Abhishek_Gupta1", + "~Aurick_Zhou1", + "berkeley.edu", + "~Sergey_Levine1" + ], + "rank": 2080 + }, + { + "url": "https://openreview.net/forum?id=JVs1OrQgR3A", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 5 + ], + "rating": "4.71", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "We present augmented counterfactual ordinary differential equations (ACODEs), a new approach to counterfactual inference on time series data with a focus on healthcare applications. ACODEs model interventions in continuous time with differential equations, augmented by auxiliary confounding variables to reduce inference bias. Experiments on tumor growth simulation and sepsis patient treatment response show that ACODEs outperform other methods like counterfactual Gaussian processes, recurrent marginal structural networks, and time series deconfounders in the accuracy of counterfactual inference. 
The learned auxiliary variables also reveal new insights into causal interventions and hidden confounders.", + "title": "Time Series Counterfactual Inference with Hidden Confounders", + "authors": [ + "Guangyu Li", + "Jiahao Chen", + "Samuel A Assefa", + "Yan Liu" + ], + "emails": [ + "jpmorgan.com", + "~Jiahao_Chen1", + "~Yan_Liu1", + "~Guangyu_Li2" + ], + "rank": 2081 + }, + { + "url": "https://openreview.net/forum?id=axNDkxU9-6z", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5, + 4 + ], + "rating": "4.71", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "We present MDP Playground, an efficient benchmark for Reinforcement Learning (RL) algorithms with various dimensions of hardness that can be controlled independently to challenge algorithms in different ways and to obtain varying degrees of hardness in generated environments. We consider and allow control over a wide variety of key hardness dimensions, including delayed rewards, rewardable sequences, sparsity of rewards, stochasticity, image representations, irrelevant features, time unit, and action max. While it is very time consuming to run RL algorithms on standard benchmarks, we define a parameterised collection of fast-to-run toy benchmarks in OpenAI Gym by varying these dimensions. Despite their toy nature and low compute requirements, we show that these benchmarks present substantial challenges to current RL algorithms. Furthermore, since we can generate environments with a desired value for each of the dimensions, in addition to having fine-grained control over the environments' hardness, we also have the ground truth available for evaluating algorithms. Finally, we evaluate the kinds of transfer for these dimensions that may be expected from our benchmarks to more complex benchmarks. 
We believe that MDP Playground is a valuable testbed for researchers designing new, adaptive and intelligent RL algorithms and those wanting to unit test their algorithms.\n", + "title": "MDP Playground: Controlling Orthogonal Dimensions of Hardness in Toy Environments", + "authors": [ + "Raghu Rajan", + "Jessica Lizeth Borja Diaz", + "Suresh Guttikonda", + "Fabio Ferreira", + "Andr\u00e9 Biedenkapp", + "Frank Hutter" + ], + "emails": [ + "~Suresh_Guttikonda2", + "~Frank_Hutter1", + "~Andr\u00e9_Biedenkapp1", + "~Fabio_Ferreira1", + "~Raghu_Rajan1", + "~Jessica_Lizeth_Borja_Diaz1" + ], + "rank": 2082 + }, + { + "url": "https://openreview.net/forum?id=8iW8HOidj1_", + "decision": "Reject", + "ratings": [ + 4, + 6, + 4, + 5 + ], + "rating": "4.71", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Learning and planning with latent space dynamics has been shown to be useful for sample efficiency in model-based reinforcement learning (MBRL) for discrete and continuous control tasks. In particular, recent work, for discrete action spaces, demonstrated the effectiveness of latent-space planning via Monte-Carlo Tree Search (MCTS) for bootstrapping MBRL during learning and at test time. However, the potential gains from latent-space tree search have not yet been demonstrated for environments with continuous action spaces. In this work, we propose and explore an MBRL approach for continuous action spaces based on tree-based planning over learned latent dynamics. We show that it is possible to demonstrate the types of bootstrapping benefits as previously shown for discrete spaces. In particular, the approach achieves improved sample efficiency and performance on a majority of challenging continuous-control benchmarks compared to the state-of-the-art. 
", + "title": "Dream and Search to Control: Latent Space Planning for Continuous Control", + "authors": [ + "Anurag Koul", + "Varun Kumar Vijay", + "Alan Fern", + "Somdeb Majumdar" + ], + "emails": [ + "~Alan_Fern1", + "~Varun_Kumar_Vijay1", + "~Somdeb_Majumdar1", + "~Anurag_Koul1" + ], + "rank": 2083 + }, + { + "url": "https://openreview.net/forum?id=SUyxNGzUsH", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5, + 5 + ], + "rating": "4.71", + "confidences": [ + 2, + 4, + 3, + 5 + ], + "abstract": "Neural module networks (NMN) have achieved success in image-grounded tasks such as question answering (QA) on synthetic images. However, very limited work on NMN has been studied in the video-grounded language tasks. These tasks extend the complexity of traditional visual tasks with the additional visual temporal variance. Motivated by recent NMN approaches on image-grounded tasks, we introduce Visio-Linguistic Neural Module Network (VilNMN) to model the information retrieval process in video-grounded language tasks as a pipeline of neural modules. VilNMN first decomposes all language components to explicitly resolves entity references and detect corresponding action-based inputs from the question. Detected entities and actions are used as parameters to instantiate neural module networks and extract visual cues from the video. Our experiments show that VilNMN can achieve promising performance on two video-grounded language tasks: video QA and video-grounded dialogues. ", + "title": "VilNMN: A Neural Module Network approach to Video-Grounded Language Tasks", + "authors": [ + "Hung Le", + "Nancy F. 
Chen", + "Steven Hoi" + ], + "emails": [ + "~Steven_Hoi2", + "~Nancy_F._Chen1", + "~Hung_Le2" + ], + "rank": 2084 + }, + { + "url": "https://openreview.net/forum?id=0owsv3F-fM", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 5 + ], + "rating": "4.71", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "Domain adaptation is a promising direction for deploying RL agents in real-world applications, where vision-based robotics tasks constitute an important part. Cur-rent methods that train polices on simulated images not only require a delicately crafted simulator, but also add extra burdens to the training process. In this paper, we propose a method that can learn a mapping from high-dimensional images to low-level simulator states, allowing agents trained on the source domain of state input to transfer well to the target domain of image input. By fully leveraging the sequential information in the trajectories and incorporating the policy to guide the training process, our method overcomes the intrinsic ill-posedness in cross-modal domain adaptation when structural constraints from the same modality are unavailable. 
Experiments on MuJoCo environments show that the policy, once combined with the mapping function, can be deployed directly in the target domain with only a small performance gap, while current methods designed for same-modal domain adaptation fail on this problem.", + "title": "Cross-Modal Domain Adaptation for Reinforcement Learning", + "authors": [ + "Xiong-Hui Chen", + "Shengyi Jiang", + "Feng Xu", + "Yang Yu" + ], + "emails": [ + "~Feng_Xu6", + "~Shengyi_Jiang2", + "~Xiong-Hui_Chen1", + "~Yang_Yu5" + ], + "rank": 2085 + }, + { + "url": "https://openreview.net/forum?id=GtiDFD1pxpz", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 4 + ], + "rating": "4.71", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "We present a novel machine learning architecture that uses a single high-dimensional nonlinearity consisting of the exponential of a single input-dependent matrix. The mathematical simplicity of this architecture allows a detailed analysis of its behaviour, providing robustness guarantees via Lipschitz bounds. Despite its simplicity, a single matrix exponential layer already provides universal approximation properties and can learn and extrapolate fundamental functions of the input, such as periodic structure or geometric invariants. 
This architecture outperforms other general-purpose architectures on benchmark problems, including CIFAR-10, using fewer parameters.", + "title": "Intelligent Matrix Exponentiation", + "authors": [ + "Thomas Fischbacher", + "Iulia Maria Comsa", + "Krzysztof Potempa", + "Moritz Firsching", + "Luca Versari", + "Jyrki Alakuijala" + ], + "emails": [ + "~Iulia_Maria_Comsa1", + "gmail.com", + "~Luca_Versari1", + "~Moritz_Firsching1", + "~Thomas_Fischbacher1", + "~Jyrki_Alakuijala1" + ], + "rank": 2086 + }, + { + "url": "https://openreview.net/forum?id=QcjTNc_afvH", + "ratings": [ + 5, + 5, + 6, + 3 + ], + "rating": "4.71", + "confidences": [ + 3, + 4, + 5, + 5 + ], + "abstract": "Generative adversarial networks (GANs) have been successful in synthesizing and manipulating synthetic but realistic images from latent vectors. However, it is still challenging for GANs to manipulate real images, especially in real-time. State-of-the-art GAN-based methods for editing real images suffer from time-consuming operations in projecting real images to latent vectors. Alternatively, an encoder can be trained to embed real images to the latent space instantly, but it loses details drastically. We propose StyleMapGAN, which adopts a novel representation of latent space, called stylemap, incorporating spatial dimensions into embedding. Because each spatial location in the stylemap contributes to its corresponding region of the generated images, the real-time projection through the encoder becomes accurate as well as editing real images becomes spatially controllable. Experimental results demonstrate that our method significantly outperforms state-of-the-art models in various image manipulation tasks such as local editing and image interpolation. Especially, detailed comparisons show that our local editing method successfully reflects not only the color and texture but also the shape of a reference image while preserving untargeted regions. 
", + "title": "A StyleMap-Based Generator for Real-Time Image Projection and Local Editing", + "authors": [ + "Hyunsu Kim", + "Yunjey Choi", + "Junho Kim", + "Sungjoo Yoo", + "Youngjung Uh" + ], + "emails": [ + "~Hyunsu_Kim1", + "~Junho_Kim3", + "~Sungjoo_Yoo1", + "~Youngjung_Uh2", + "~Yunjey_Choi3" + ], + "rank": 2087 + }, + { + "url": "https://openreview.net/forum?id=cotg54BSX8", + "decision": "Reject", + "ratings": [ + 5, + 7, + 3, + 4 + ], + "rating": "4.71", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "Model extraction attacks attempt to replicate a target machine learning model from predictions obtained by querying its inference API. Most existing attacks on Deep Neural Networks achieve this by supervised training of the copy using the victim's predictions. An emerging class of attacks exploit algebraic properties of DNNs to obtain high-fidelity copies using orders of magnitude fewer queries than the prior state-of-the-art. So far, such powerful attacks have been limited to networks with few hidden layers and ReLU activations.\n\nIn this paper we present algebraic attacks on large-scale natural language models in a grey-box setting, targeting models with a pre-trained (public) encoder followed by a single (private) classification layer. Our key observation is that a small set of arbitrary embedding vectors is likely to form a basis of the classification layer's input space, which a grey-box adversary can compute. We show how to use this information to solve an equation system that determines the classification layer from the corresponding probability outputs.\n\nWe evaluate the effectiveness of our attacks on different sizes of transformer models and downstream tasks. Our key findings are that (i) with frozen base layers, high-fidelity extraction is possible with a number of queries that is as small as twice the input dimension of the last layer. 
This is true even for queries that are entirely in-distribution, making extraction attacks indistinguishable from legitimate use; (ii) with fine-tuned base layers, the effectiveness of algebraic attacks decreases with the learning rate, showing that fine-tuning is not only beneficial for accuracy but also indispensable for model confidentiality.", + "title": "Grey-box Extraction of Natural Language Models", + "authors": [ + "Santiago Zanella-Beguelin", + "Shruti Tople", + "Andrew Paverd", + "Boris K\u00f6pf" + ], + "emails": [ + "~Santiago_Zanella-Beguelin1", + "~Shruti_Tople2", + "microsoft.com", + "~Boris_K\u00f6pf1" + ], + "rank": 2088 + }, + { + "url": "https://openreview.net/forum?id=uRKqXoN-Ic9", + "decision": "Reject", + "ratings": [ + 6, + 2, + 7, + 5 + ], + "rating": "4.71", + "confidences": [ + 3, + 5, + 4, + 2 + ], + "abstract": "Robustness to adversarial perturbations and accurate uncertainty estimation are crucial for reliable application of deep learning in real world settings. Dirichlet-based uncertainty (DBU) models are a family of models that predict the parameters of a Dirichlet distribution (instead of a categorical one) and promise to signal when not to trust their predictions. Untrustworthy predictions are obtained on unknown or ambiguous samples and marked with a high uncertainty by the models.\n\nIn this work, we show that DBU models with standard training are not robust w.r.t. three important tasks in the field of uncertainty estimation. First, we evaluate how useful the uncertainty estimates are to (1) indicate correctly classified samples. Our results show that while they are a good indicator on unperturbed data, performance on perturbed data decreases dramatically. (2) We evaluate if uncertainty estimates are able to detect adversarial examples that try to fool classification. It turns out that uncertainty estimates are able to detect FGSM attacks but not able to detect PGD attacks. 
We further evaluate the reliability of DBU models on the task of (3) distinguishing between in-distribution (ID) and out-of-distribution (OOD) data. To this end, we present the first study of certifiable robustness for DBU models. Furthermore, we propose novel uncertainty attacks that fool models into assigning high confidence to OOD data and low confidence to ID data, respectively. \nBoth approaches show that detecting OOD samples and distinguishing between ID-data and OOD-data is not robust.\n\nBased on our results, we explore the first approaches to make DBU models more robust. We use adversarial training procedures based on label attacks, uncertainty attacks, or random noise and demonstrate how they affect robustness of DBU models on ID data and OOD data.", + "title": "Evaluating Robustness of Predictive Uncertainty Estimation: Are Dirichlet-based Models Reliable?", + "authors": [ + "Anna-Kathrin Kopetzki", + "Bertrand Charpentier", + "Daniel Z\u00fcgner", + "Sandhya Giri", + "Stephan G\u00fcnnemann" + ], + "emails": [ + "in.tum.de", + "~Daniel_Z\u00fcgner1", + "~Bertrand_Charpentier2", + "~Stephan_G\u00fcnnemann1", + "~Anna-Kathrin_Kopetzki1" + ], + "rank": 2089 + }, + { + "url": "https://openreview.net/forum?id=DdGCxq9C_Gr", + "decision": "Reject", + "ratings": [ + 3, + 6, + 4, + 6 + ], + "rating": "4.71", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "A World Model is a generative model used to simulate an environment. World Models have proven capable of learning spatial and temporal representations of Reinforcement Learning environments. In some cases, a World Model offers an agent the opportunity to learn entirely inside of its own dream environment. In this work we explore improving the generalization capabilities from dream environments to reality (Dream2Real). We present a general approach to improve a controller's ability to transfer from a neural network dream environment to reality at little additional cost. 
These improvements are gained by drawing on inspiration from domain randomization, where the basic idea is to randomize as much of a simulator as possible without fundamentally changing the task at hand. Generally, domain randomization assumes access to a pre-built simulator with configurable parameters but oftentimes this is not available. By training the World Model using dropout, the dream environment is capable of creating a nearly infinite number of \\textit{different} dream environments. Our experimental results show that Dropout's Dream Land is an effective technique to bridge the reality gap between dream environments and reality. Furthermore, we additionally perform an extensive set of ablation studies. ", + "title": "Dropout's Dream Land: Generalization from Learned Simulators to Reality", + "authors": [ + "Zac Wellmer", + "James Kwok" + ], + "emails": [ + "~James_Kwok1", + "~Zac_Wellmer1" + ], + "rank": 2090 + }, + { + "url": "https://openreview.net/forum?id=DMxOBm06HUx", + "decision": "Reject", + "ratings": [ + 5, + 4, + 7, + 3, + 5 + ], + "rating": "4.70", + "confidences": [ + 5, + 4, + 3, + 4, + 4 + ], + "abstract": "Pre-trained language models such as BERT have exhibited remarkable performances in many tasks in natural language understanding (NLU). The tokens in the models are usually fine-grained in the sense that for languages like English they are words or sub-words and for languages like Chinese they are characters. In English, for example, there are multi-word expressions which form natural lexical units and thus the use of coarse-grained tokenization also appears to be reasonable. In fact, both fine-grained and coarse-grained tokenizations have advantages and disadvantages for learning of pre-trained language models. In this paper, we propose a novel pre-trained language model, referred to as AMBERT (A Multi-grained BERT), on the basis of both fine-grained and coarse-grained tokenizations. 
For English, AMBERT takes both the sequence of words (fine-grained tokens) and the sequence of phrases (coarse-grained tokens) as input after tokenization, employs one encoder for processing the sequence of words and the other encoder for processing the sequence of the phrases, utilizes shared parameters between the two encoders, and finally creates a sequence of contextualized representations of the words and a sequence of contextualized representations of the phrases. Experiments have been conducted on benchmark datasets for Chinese and English, including CLUE, GLUE, SQuAD and RACE. The results show that AMBERT outperforms the existing best performing models in almost all cases, particularly the improvements are significant for Chinese. We also develop a version of AMBERT which performs equally well as AMBERT but uses about half of its inference time.", + "title": "AMBERT: A Pre-trained Language Model with Multi-Grained Tokenization", + "authors": [ + "Xinsong Zhang", + "Hang Li" + ], + "emails": [ + "~Hang_Li4", + "~Xinsong_Zhang1" + ], + "rank": 2091 + }, + { + "url": "https://openreview.net/forum?id=Y45i-hDynr", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4 + ], + "rating": "4.70", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "We present a novel graph convolutional layer that is fast, conceptually simple, and provides high accuracy with reduced overfitting. Based on pseudo-differential operators, our layer operates on graphs with relative position information available for each pair of connected nodes. We evaluate our method on a variety of supervised learning tasks, including superpixel image classification using the MNIST, CIFAR10, and CIFAR100 superpixel datasets, node correspondence using the FAUST dataset, and shape classification using the ModelNet10 dataset. 
The new layer outperforms multiple recent architectures on superpixel image classification tasks using the MNIST and CIFAR100 superpixel datasets and performs comparably with recent results on the CIFAR10 superpixel dataset. We measure test accuracy without bias to the test set by selecting the model with the best training accuracy. The new layer achieves a test error rate of 0.80% on the MNIST superpixel dataset, beating the closest reported rate of 0.95% by a factor of more than 15%. After dropping roughly 70% of the edge connections from the input by performing a Delaunay triangulation, our model still achieves a competitive error rate of 1.04%.", + "title": "Parameterized Pseudo-Differential Operators for Graph Convolutional Neural Networks", + "authors": [ + "Kevin M. Potter", + "Steven Richard Sleder", + "Matthew David Smith", + "John Tencer" + ], + "emails": [ + "sandia.gov", + "~Kevin_M._Potter1" + ], + "rank": 2092 + }, + { + "url": "https://openreview.net/forum?id=HNA0kUAFdbv", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4 + ], + "rating": "4.70", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "Layout representation, which models visual elements in a canvas and their inter-relations, plays a crucial role in graphic design intelligence.\nWith a large variety of layout designs and the unique characteristic of layouts that visual elements are defined as a list of categorical (e.g. shape type) and numerical (e.g. position and size) properties, it is challenging to learn a general and compact representation with limited data. Inspired by the recent success of self-supervised pre-training techniques in various natural language processing tasks, in this paper, we propose CanvasEmb (Canvas Embedding), which pre-trains deep representation from unlabeled graphic designs by jointly conditioning on all the context elements in the same canvas, with a multi-dimensional feature encoder and a multi-task learning objective. 
The pre-trained CanvasEmb model can be fine-tuned with just one additional output layer and with a small size of training data to create models for a wide range of downstream tasks. We verify our approach with presentation slides data. We construct a large-scale dataset with more than one million slides, and propose two novel layout understanding tasks with human labeling sets, namely element role labeling and image captioning. Evaluation results on these two tasks show that our model with fine-tuning achieves state-of-the-art performances. Furthermore, we conduct a deep analysis aiming to understand the modeling mechanism of CanvasEmb, and demonstrate its great potential use on more applications such as layout auto completion and layout retrieval.", + "title": "CANVASEMB: Learning Layout Representation with Large-scale Pre-training for Graphic Design", + "authors": [ + "Yuxi Xie", + "Danqing Huang", + "Jinpeng Wang", + "Chin-Yew Lin" + ], + "emails": [ + "~Danqing_Huang1", + "gmail.com", + "~Chin-Yew_Lin1", + "~Yuxi_Xie1" + ], + "rank": 2093 + }, + { + "url": "https://openreview.net/forum?id=6c6KZUdm1Nq", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5 + ], + "rating": "4.69", + "confidences": [ + 5, + 4, + 4 + ], + "abstract": "We address a regression problem from weakly labeled data that are correctly labeled only above a regression line, i.e., upper one-side labeled data.\nThe label values of the data are the results of sensing the magnitude of some phenomenon.\nIn this case, the labels often contain missing or incomplete observations whose values are lower than those of correct observations and are also usually lower than the regression line. 
It follows that data labeled with lower values than the estimations of a regression function (lower-side data) are mixed with data that should originally be labeled above the regression line (upper-side data).\nWhen such missing label observations are observed in a non-negligible amount, we thus should assume our lower-side data to be unlabeled data that are a mix of original upper- and lower-side data.\nWe formulate a regression problem from these upper-side labeled and lower-side unlabeled data. We then derive a learning algorithm in an unbiased and consistent manner to ordinary regression that is learned from data labeled correctly in both upper- and lower-side cases. Our key idea is that we can derive a gradient that requires only upper-side data and unlabeled data as the equivalent expression of that for ordinary regression. We additionally found that a specific class of losses enables us to learn unbiased solutions practically. In numerical experiments on synthetic and real-world datasets, we demonstrate the advantages of our algorithm.", + "title": "Regression from Upper One-side Labeled Data", + "authors": [ + "Takayuki Katsuki" + ], + "emails": [ + "~Takayuki_Katsuki2" + ], + "rank": 2094 + }, + { + "url": "https://openreview.net/forum?id=O358nrve1W", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4 + ], + "rating": "4.69", + "confidences": [ + 5, + 4, + 4 + ], + "abstract": "The ability to synthesise source code from input/output examples allows nonexperts to generate programs, and experts to abstract away a wide range of simple programming tasks. Current research in this area has explored neural synthesis, SMT solvers, and genetic programming; each of these approaches is limited, however, often using highly specialised target languages for synthesis. 
In this paper we present a novel hybrid approach using neural networks to guide genetic programming (GP), which allows us to successfully synthesise code from just ten I/O examples in a generalised Turing complete target language, up to and including a sorting algorithm. We show that GP by itself is able to synthesise a set of simple programs, and show which hints (suggested lines of code for inclusion) are of most utility to GP in solving harder problems. Using a form of unstructured curriculum learning, we then demonstrate that neural networks can be used to determine when to make use of these high-utility hints for specific I/O problems and thus enable complex functions to be successfully synthesised. We apply our approach to two different problem sets: common array-to-array programs (including sorting), and a canvas drawing problem set inspired by So & Oh (2018).", + "title": "Neurally Guided Genetic Programming for Turing Complete Programming by Example", + "authors": [ + "Alexander Newton Wild", + "Barry Porter" + ], + "emails": [ + "~Barry_Porter1", + "~Alexander_Newton_Wild1" + ], + "rank": 2095 + }, + { + "url": "https://openreview.net/forum?id=j6rILItz4yr", + "decision": "Reject", + "ratings": [ + 6, + 4, + 4, + 5 + ], + "rating": "4.69", + "confidences": [ + 4, + 5, + 4, + 3 + ], + "abstract": "Adversarial training is an effective method to combat adversarial attacks in order to create robust neural networks. By using an auxiliary batch normalization on adversarial examples, it has been shown recently to possess great potential in improving the generalization ability of neural networks for image recognition as well. However, crafting pixel-level adversarial perturbations is computationally expensive. To address this issue, we propose AdversariaL Feature Augmentation (ALFA), which advocates adversarial training on the intermediate layers of feature embeddings. 
ALFA utilizes both clean and adversarial augmented features jointly to enhance standard trained networks. To eliminate laborious tuning of key parameters such as locations and strength of feature augmentations, we further design a learnable adversarial feature augmentation (L-ALFA) framework to automatically adjust the perturbation magnitude of each perturbed feature. Extensive experiments demonstrate that our proposed ALFA and L-ALFA methods achieve significant and consistent generalization improvement over strong baselines on CIFAR-10, CIFAR-100, and ImageNet benchmarks across different backbone networks for image recognition. ", + "title": "ALFA: Adversarial Feature Augmentation for Enhanced Image Recognition", + "authors": [ + "Tianlong Chen", + "Yu Cheng", + "Zhe Gan", + "Yu Hu", + "Zhangyang Wang", + "Jingjing Liu" + ], + "emails": [ + "~Jingjing_Liu2", + "~Zhangyang_Wang1", + "~Tianlong_Chen1", + "~Yu_Cheng1", + "~Zhe_Gan1", + "~Yu_Hu3" + ], + "rank": 2096 + }, + { + "url": "https://openreview.net/forum?id=67ChnrC0ybo", + "ratings": [ + 4, + 5, + 4, + 6 + ], + "rating": "4.69", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "Despite the great advantage in search efficiency, DARTS often suffers weak stability, which reflects in the large variation among individual trials as well as the sensitivity to the hyper-parameters of the search process. This paper owes such instability to an optimization gap between the super-network and its sub-networks, namely, improving the validation accuracy of the super-network does not necessarily lead to a higher expectation on the performance of the sampled sub-networks. Then, we point out that the gap is due to the inaccurate estimation of the architectural gradients, based on which we propose an amended estimation method. Mathematically, our method guarantees a bounded error from the true gradients while the original estimation does not. 
Our approach bridges the gap from two aspects, namely, amending the estimation on the architectural gradients, and unifying the hyper-parameter settings in the search and re-training stages. Experiments on CIFAR10, ImageNet, and Penn Treebank demonstrate that our approach largely improves search stability and, more importantly, enables DARTS-based approaches to explore much larger search spaces that have not been investigated before.\n", + "title": "Stabilizing DARTS with Amended Gradient Estimation on Architectural Parameters", + "authors": [ + "Kaifeng Bi", + "Lingxi Xie", + "Changping Hu", + "Xin Chen", + "Longhui Wei", + "Qi Tian" + ], + "emails": [ + "mails.tsinghua.edu.cn", + "~Longhui_Wei1", + "huawei.com", + "~Qi_Tian3", + "~Lingxi_Xie1" + ], + "rank": 2097 + }, + { + "url": "https://openreview.net/forum?id=XOuAOv_-5Fx", + "decision": "Reject", + "ratings": [ + 4, + 6, + 4, + 5 + ], + "rating": "4.69", + "confidences": [ + 4, + 3, + 3, + 3 + ], + "abstract": "Various metrics have recently been proposed to measure uncertainty calibration of deep models for classification. However, these metrics either fail to capture miscalibration correctly or lack interpretability. We propose to use the normalized entropy as a measure of uncertainty and derive the Uncertainty Calibration Error (UCE), a comprehensible calibration metric for multi-class classification. In our experiments, we focus on uncertainty from variational Bayesian inference methods and compare UCE to established calibration errors on the task of multi-class image classification. UCE avoids several pathologies of other metrics, but does not sacrifice interpretability. 
It can be used for regularization to improve calibration during training without penalizing predictions with justified high confidence.", + "title": "Uncertainty Calibration Error: A New Metric for Multi-Class Classification", + "authors": [ + "Max-Heinrich Laves", + "Sontje Ihler", + "Karl-Philipp Kortmann", + "Tobias Ortmaier" + ], + "emails": [ + "~Max-Heinrich_Laves1", + "~Sontje_Ihler1", + "imes.uni-hannover.de" + ], + "rank": 2098 + }, + { + "url": "https://openreview.net/forum?id=0z1HScLBEpb", + "decision": "Reject", + "ratings": [ + 5, + 6, + 3, + 5 + ], + "rating": "4.69", + "confidences": [ + 4, + 5, + 5, + 2 + ], + "abstract": "This paper focuses on cooperative value-based multi-agent reinforcement learning (MARL) in the paradigm of centralized training with decentralized execution (CTDE). Current state-of-the-art value-based MARL methods leverage CTDE to learn a centralized joint-action value function as a monotonic mixing of each agent's utility function, which enables easy decentralization. However, this monotonic restriction leads to inefficient exploration in tasks with nonmonotonic returns due to suboptimal approximations of the values of joint actions. To address this, we present a novel MARL approach called Universal Value Exploration (UneVEn), which uses universal successor features (USFs) to learn policies of tasks related to the target task, but with simpler reward functions in a sample efficient manner. UneVEn uses novel action-selection schemes between randomly sampled related tasks during exploration, which enables the monotonic joint-action value function of the target task to place more importance on useful joint actions. 
Empirical results on a challenging cooperative predator-prey task requiring significant coordination amongst agents show that UneVEn significantly outperforms state-of-the-art baselines.", + "title": "UneVEn: Universal Value Exploration for Multi-Agent Reinforcement Learning", + "authors": [ + "Tarun Gupta", + "Anuj Mahajan", + "Bei Peng", + "Wendelin Boehmer", + "Shimon Whiteson" + ], + "emails": [ + "~Shimon_Whiteson1", + "~Wendelin_Boehmer1", + "~Tarun_Gupta3", + "~Anuj_Mahajan1", + "~Bei_Peng2" + ], + "rank": 2099 + }, + { + "url": "https://openreview.net/forum?id=c8X4F8jyxo0", + "ratings": [ + 5, + 5, + 5, + 4 + ], + "rating": "4.69", + "confidences": [ + 4, + 3, + 2, + 4 + ], + "abstract": "Multi-output Gaussian processes (MOGPs) introduce correlations between outputs, but are subject to negative transfer, where learned correlations associate an output with another that is actually unrelated, leading to diminished predictive accuracy. Negative transfer may be countered by structuring the MOGP to follow conditional independence statements so that independent outputs do not correlate, but to date the imposed structures have been hand selected for specific applications. We introduce the DAG-GP model, which linearly combines latent Gaussian processes so that MOGP outputs follow a directed acyclic graph structure. Our method exposes a deep connection between MOGPs and Gaussian directed graphical models, which has not been explored explicitly in prior work. We propose to learn the graph from data prior to training the MOGP, so that correlations between outputs are only introduced when justified. Automated structure learning means no prior knowledge of the conditional independence between outputs is required, and training multiple MOGPs to identify the best structure is no longer necessary. Graph structure is learned by applying existing structure learning algorithms developed for graphical models to a downselected set of MOGP training data. 
Experiments on real world data sets show that with sufficiently expressive kernels, prediction error and likelihood are improved when using the DAG-GP model compared to state of the art exact MOGP methods. ", + "title": "DAG-GPs: Learning Directed Acyclic Graph Structure For Multi-Output Gaussian Processes", + "authors": [ + "Benjamin J. Ayton", + "Brian Williams" + ], + "emails": [ + "~Brian_Williams1", + "~Benjamin_J._Ayton1" + ], + "rank": 2100 + }, + { + "url": "https://openreview.net/forum?id=yN18f9V1Onp", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 4, + 5 + ], + "rating": "4.69", + "confidences": [ + 4, + 3, + 2, + 2, + 2 + ], + "abstract": "In multi-agent reinforcement learning (MARL), the learning rates of actors and critic are mostly hand-tuned and fixed. This not only requires heavy tuning but more importantly limits the learning. With adaptive learning rates according to gradient patterns, some optimizers have been proposed for general optimizations, which however do not take into consideration the characteristics of MARL. In this paper, we propose AdaMa to bring adaptive learning rates to cooperative MARL. AdaMa evaluates the contribution of actors' updates to the improvement of Q-value and adaptively updates the learning rates of actors to the direction of maximally improving the Q-value. AdaMa could also dynamically balance the learning rates between the critic and actors according to their varying effects on the learning. Moreover, AdaMa can incorporate the second-order approximation to capture the contribution of pairwise actors' updates and thus more accurately updates the learning rates of actors. 
Empirically, we show that AdaMa could accelerate the learning and improve the performance in a variety of multi-agent scenarios, and the visualizations of learning rates during training clearly explain how and why AdaMa works.", + "title": "Adaptive Learning Rates for Multi-Agent Reinforcement Learning", + "authors": [ + "Jiechuan Jiang", + "Zongqing Lu" + ], + "emails": [ + "~Jiechuan_Jiang1", + "~Zongqing_Lu2" + ], + "rank": 2101 + }, + { + "url": "https://openreview.net/forum?id=TfscevJuPNY", + "decision": "Reject", + "ratings": [ + 5, + 5, + 7, + 3 + ], + "rating": "4.69", + "confidences": [ + 4, + 3, + 2, + 4 + ], + "abstract": "Training modern neural networks is an inherently noisy process that can lead to high \\emph{prediction churn}-- disagreements between re-trainings of the same model due to factors such as randomization in the parameter initialization and mini-batches-- even when the trained models all attain high accuracies. Such prediction churn can be very undesirable in practice. In this paper, we present several baselines for reducing churn and show that utilizing the k-NN predictions to smooth the labels results in a new and principled method that often outperforms the baselines on churn while improving accuracy on a variety of benchmark classification tasks and model architectures.", + "title": "Deep k-NN Label Smoothing Improves Reproducibility of Neural Network Predictions", + "authors": [ + "Dara Bahri", + "Heinrich Jiang" + ], + "emails": [ + "~Dara_Bahri1", + "~Heinrich_Jiang1" + ], + "rank": 2102 + }, + { + "url": "https://openreview.net/forum?id=LMslR3CTzE_", + "decision": "Reject", + "ratings": [ + 6, + 3, + 5, + 5 + ], + "rating": "4.69", + "confidences": [ + 5, + 5, + 3, + 3 + ], + "abstract": "Subgraph matching is the problem of determining the presence and location(s) of a given query graph in a large target graph. 
\nDespite being an NP-complete problem, the subgraph matching problem is crucial in domains ranging from network science and database systems to biochemistry and cognitive science. \nHowever, existing techniques based on combinatorial matching and integer programming cannot handle matching problems with both large target and query graphs.\nHere we propose NeuroMatch, an accurate, efficient, and robust neural approach to subgraph matching. NeuroMatch decomposes query and target graphs into small subgraphs and embeds them using graph neural networks. Trained to capture geometric constraints corresponding to subgraph relations, NeuroMatch then efficiently performs subgraph matching directly in the embedding space. Experiments demonstrate NeuroMatch is 100x faster than existing combinatorial approaches and 18% more accurate than existing approximate subgraph matching methods.", + "title": "Neural Subgraph Matching", + "authors": [ + "Zhitao Ying", + "Andrew Wang", + "Jiaxuan You", + "Chengtao Wen", + "Arquimedes Canedo", + "Jure Leskovec" + ], + "emails": [ + "~Jiaxuan_You2", + "~Jure_Leskovec1", + "cs.stanford.edu", + "siemens.com", + "~Zhitao_Ying1" + ], + "rank": 2103 + }, + { + "url": "https://openreview.net/forum?id=T3MCvI0gqEV", + "ratings": [ + 5, + 4, + 5 + ], + "rating": "4.69", + "confidences": [ + 5, + 4, + 4 + ], + "abstract": "Learning visual knowledge from massive weakly-labeled web videos has attracted growing research interests thanks to the large corpus of easily accessible video data on the Internet. However, for video action recognition, the action of interest might only exist in arbitrary clips of untrimmed web videos, resulting in high label noises in the temporal space. To address this issue, we introduce a new method for pre-training video action recognition models using queried web videos. 
Instead of trying to filter out, we propose to convert the potential noises in these queried videos to useful supervision signals by defining the concept of Sub-Pseudo Label (SPL). Specifically, SPL spans out a new set of meaningful \"middle ground\" label space constructed by extrapolating the original weak labels during video querying and the prior knowledge distilled from a teacher model. Consequently, SPL provides enriched supervision for video models to learn better representations for downstream tasks. We validate the effectiveness of our method on four video action recognition datasets and a weakly-labeled image dataset to study the generalization ability. Experiments show that SPL outperforms several existing pre-training strategies using pseudo-labels and achieves competitive results on HMDB-51 and UCF-101 datasets compared with recent pre-training methods.", + "title": "Exploring Sub-Pseudo Labels for Learning from Weakly-Labeled Web Videos", + "authors": [ + "Kunpeng Li", + "Zizhao Zhang", + "Guanhang Wu", + "Xuehan Xiong", + "Chen-Yu Lee", + "Yun Fu", + "Tomas Pfister" + ], + "emails": [ + "~Xuehan_Xiong1", + "~Zizhao_Zhang3", + "~Kunpeng_Li1", + "~Yun_Fu1", + "~Guanhang_Wu1", + "~Tomas_Pfister1", + "~Chen-Yu_Lee2" + ], + "rank": 2104 + }, + { + "url": "https://openreview.net/forum?id=pQhnag-dIt", + "ratings": [ + 4, + 3, + 7, + 5 + ], + "rating": "4.69", + "confidences": [ + 3, + 5, + 4, + 4 + ], + "abstract": "Attention-based methods have played an important role in model interpretations, where the calculated attention weights are expected to highlight the critical parts of inputs (e.g., keywords in sentences). However, recent research points out that attention-as-importance interpretations often do not work as well as we expect. For example, learned attention weights sometimes highlight less meaningful tokens like \"[SEP]\", \",\", and \".\", and are frequently uncorrelated with other feature importance indicators like gradient-based measures. 
Finally, a debate on the effectiveness of attention-based interpretations has raised. In this paper, we reveal that one root cause of this phenomenon can be ascribed to the combinatorial shortcuts, which stand for that the models may not only obtain information from the highlighted parts, but also from the attention weights themselves, as a result, the attention weights are no longer pure importance indicators. We analyze the combinatorial shortcuts theoretically, design one intuitive experiment to demonstrate their existence, and propose two methods to mitigate this issue. Empirical studies on attention-based interpretation models are conducted, and the results show that the proposed methods can effectively improve the interpretability of attention mechanisms on a variety of datasets.", + "title": "Why is Attention Not So Interpretable?", + "authors": [ + "Bing Bai", + "Jian Liang", + "Guanhua Zhang", + "Hao Li", + "Kun Bai", + "Fei Wang" + ], + "emails": [ + "~Hao_Li15", + "~Guanhua_Zhang1", + "~Bing_Bai2", + "~Jian_Liang3", + "~Fei_Wang3", + "~Kun_Bai1" + ], + "rank": 2105 + }, + { + "url": "https://openreview.net/forum?id=iMKvxHlrZb3", + "decision": "Reject", + "ratings": [ + 5, + 5, + 3, + 6 + ], + "rating": "4.68", + "confidences": [ + 5, + 5, + 5, + 4 + ], + "abstract": "Graph neural networks (GNNs) are a popular class of parametric model for learning over graph-structured data. Recent work has argued that GNNs primarily use the graph for feature smoothing, and have shown competitive results on benchmark tasks by simply operating on graph-smoothed node features, rather than using end-to-end learned feature hierarchies that are challenging to scale to large graphs. In this work, we ask whether these results can be extended to heterogeneous graphs, which encode multiple types of relationship between different entities. 
We propose Neighbor Averaging over Relation Subgraphs (NARS), which trains a classifier on neighbor-averaged features for randomly-sampled subgraphs of the \u2018metagraph\u2018 of relations. We describe optimizations to allow these sets of node features to be computed in a memory-efficient way, both at training and inference time. NARS achieves a new state of the art accuracy on several benchmark datasets, outperforming more expensive GNN-based methods.", + "title": "Scalable Graph Neural Networks for Heterogeneous Graphs", + "authors": [ + "Lingfan Yu", + "Jiajun Shen", + "Jinyang Li", + "Adam Lerer" + ], + "emails": [ + "~Jinyang_Li1", + "~Jiajun_Shen1", + "~Lingfan_Yu1", + "~Adam_Lerer1" + ], + "rank": 2106 + }, + { + "url": "https://openreview.net/forum?id=kuqBCnJuD4Z", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4 + ], + "rating": "4.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "We consider federated learning with multiple wireless edge servers having their own local coverages. We focus on speeding up training in this increasingly practical setup. Our key idea is to utilize the devices located in the overlapping areas between the coverage of edge servers; in the model-downloading stage, the devices in the overlapping areas receive multiple models from different edge servers, take the average of the received models, and then update the model with their local data. These devices send their updated model to multiple edge servers by broadcasting, which acts as bridges for sharing the trained models between servers. Even when some edge servers are given biased datasets within their coverages, their training processes can be assisted by coverages of adjacent servers, through the devices in the overlapping regions. 
As a result, the proposed scheme does not require costly communications with the central cloud server (located at the higher tier of edge servers) for model synchronization, significantly reducing the overall training time compared to the conventional cloud-based federated learning systems. Extensive experimental results show remarkable performance gains of our scheme compared to existing methods.", + "title": "FedMes: Speeding Up Federated Learning with Multiple Edge Servers", + "authors": [ + "Dong-Jun Han", + "Minseok Choi", + "Jungwuk Park", + "Jaekyun Moon" + ], + "emails": [ + "jejunu.ac.kr", + "~Jaekyun_Moon2", + "~Dong-Jun_Han1", + "kaist.ac.kr" + ], + "rank": 2107 + }, + { + "url": "https://openreview.net/forum?id=poH5qibNFZ", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5 + ], + "rating": "4.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "End-to-end training with back propagation is the standard method for training deep neural networks. However, as networks become deeper and bigger, end-to-end training becomes more challenging: highly non-convex models gets stuck easily in local optima, gradients signals are prone to vanish or explode during backpropagation, training requires computational resources and time. \n\nIn this work, we propose to break away from the end-to-end paradigm in the context of Knowledge Distillation. Instead of distilling a model end-to-end, we propose to split it into smaller sub-networks - also called neighbourhoods - that are then trained independently. We empirically show that distilling networks in a non end-to-end fashion can be beneficial in a diverse range of use cases. First, we show that it speeds up Knowledge Distillation by exploiting parallelism and training on smaller networks. Second, we show that independently distilled neighbourhoods may be efficiently re-used for Neural Architecture Search. 
Finally, because smaller networks model simpler functions, we show that they are easier to train with synthetic data than their deeper counterparts.", + "title": "Neighbourhood Distillation: On the benefits of non end-to-end distillation", + "authors": [ + "La\u00ebtitia Shao", + "Elad Eban", + "Yair Movshovitz-Attias" + ], + "emails": [ + "~Yair_Movshovitz-Attias1", + "~La\u00ebtitia_Shao1", + "~Elad_Eban1" + ], + "rank": 2108 + }, + { + "url": "https://openreview.net/forum?id=4dFyyAdWbis", + "ratings": [ + 2, + 6, + 6 + ], + "rating": "4.67", + "confidences": [ + 4, + 5, + 3 + ], + "abstract": "Breakthroughs in unsupervised domain adaptation (uDA) have opened up the possibility of adapting models from a label-rich source domain to unlabeled target domains. Prior uDA works have primarily focused on improving adaptation accuracy between the given source and target domains, and considerably less attention has been paid to the challenges that arise when uDA is deployed in practical settings. This paper puts forth a novel and complementary perspective, and investigates the algorithmic challenges that arise when uDA is deployed in a distributed ML system with multiple target domains. We propose two algorithms: i) a Collaborator Selection algorithm which selects an optimal collaborator for each target domain, and makes uDA systems more accurate and flexible; ii) a distributed training strategy that allows adversarial uDA algorithms to train in a privacy-preserving manner. 
We provide theoretical justifications and empirical results to show that our solution significantly boosts the performance of uDA in practical settings.", + "title": "Scaling Unsupervised Domain Adaptation through Optimal Collaborator Selection and Lazy Discriminator Synchronization", + "authors": [ + "Akhil Mathur", + "Shaoduo Gan", + "Anton Isopoussu", + "Fahim Kawsar", + "Nadia Berthouze", + "Nicholas Donald Lane" + ], + "emails": [ + "nokia-bell-labs.com", + "~Anton_Isopoussu1", + "~Akhil_Mathur1", + "~Nicholas_Donald_Lane1", + "~Nadia_Berthouze1", + "~Shaoduo_Gan1" + ], + "rank": 2109 + }, + { + "url": "https://openreview.net/forum?id=reEp2BReEou", + "ratings": [ + 5, + 6, + 4, + 4 + ], + "rating": "4.67", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Normalization plays an important role in the optimization of deep neural networks. While there are standard normalization methods in computer vision and natural language processing, there is limited understanding of how to effectively normalize neural networks for graph representation learning. In this paper, we propose a principled normalization method, Graph Normalization (GraphNorm), where the key idea is to normalize the feature values across all nodes for each individual graph with a learnable shift. Theoretically, we show that GraphNorm serves as a preconditioner that smooths the distribution of the graph aggregation's spectrum, leading to faster optimization. Such an improvement cannot be well obtained if we use currently popular normalization methods, such as BatchNorm, which normalizes the nodes in a batch rather than in individual graphs, due to heavy batch noises. Moreover, we show that for some highly regular graphs, the mean of the feature values contains graph structural information, and directly subtracting the mean may lead to an expressiveness degradation. The learnable shift in GraphNorm enables the model to learn to avoid such degradation for those cases. 
Empirically, Graph neural networks (GNNs) with GraphNorm converge much faster compared to GNNs with other normalization methods, e.g., BatchNorm. GraphNorm also improves generalization of GNNs, achieving better performance on graph classification benchmarks.", + "title": "GraphNorm: A Principled Approach to Accelerating Graph Neural Network Training", + "authors": [ + "Tianle Cai", + "Shengjie Luo", + "Keyulu Xu", + "Di He", + "Tie-Yan Liu", + "Liwei Wang" + ], + "emails": [ + "~Keyulu_Xu1", + "~Shengjie_Luo1", + "~Tie-Yan_Liu1", + "~Liwei_Wang1", + "~Di_He1", + "~Tianle_Cai1" + ], + "rank": 2110 + }, + { + "url": "https://openreview.net/forum?id=uDN8pRAdsoC", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5 + ], + "rating": "4.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Graph Neural Networks (GNNs) are a flexible and powerful family of models that build nodes' representations on irregular graph-structured data. This paper focuses on explaining or interpreting the rationale underlying a given prediction of already trained graph neural networks for the node classification task. Existing approaches for interpreting GNNs try to find subsets of important features and nodes by learning a continuous mask. Our objective is to find discrete masks that are arguably more interpretable while minimizing the expected deviation from the underlying model's prediction. We empirically show that our explanations are both more predictive and sparse. Additionally, we find that multiple diverse explanations are possible, which sufficiently explain a prediction. 
Finally, we analyze the explanations to find the effect of network homophily on the decision-making process of GNNs.", + "title": "Hard Masking for Explaining Graph Neural Networks", + "authors": [ + "Thorben Funke", + "Megha Khosla", + "Avishek Anand" + ], + "emails": [ + "~Avishek_Anand1", + "~Thorben_Funke1", + "~Megha_Khosla1" + ], + "rank": 2111 + }, + { + "url": "https://openreview.net/forum?id=kQ7z7PkTXi", + "ratings": [ + 4, + 6, + 5, + 4 + ], + "rating": "4.67", + "confidences": [ + 5, + 4, + 4, + 5 + ], + "abstract": "In this study we present a deep neural network-based online multi-speaker localisation algorithm based on a multi-microphone array. A fully convolutional network is trained with instantaneous spatial features to estimate the direction of arrival for each time-frequency bin. The high resolution classification enables the network to accurately and simultaneously localize and track multiple speakers, both static and dynamic. Elaborated experimental study using simulated and real-life recordings in static and dynamic scenarios, demonstrates that the proposed algorithm significantly outperforms both classic and recent deep-learning-based algorithms.", + "title": "Dynamically locating multiple speakers based on the time-frequency domain", + "authors": [ + "Hodaya Hammer", + "Shlomo Chazan", + "Jacob Goldberger", + "Sharon Gannot" + ], + "emails": [ + "~Shlomo_Chazan1", + "~Sharon_Gannot1", + "gmail.com", + "~Jacob_Goldberger1" + ], + "rank": 2112 + }, + { + "url": "https://openreview.net/forum?id=oh71uL93yay", + "decision": "Reject", + "ratings": [ + 5, + 3, + 5, + 6 + ], + "rating": "4.67", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Label Propagation (LPA) and Graph Convolutional Neural Networks (GCN) are both message passing algorithms on graphs. Both solve the task of node classification but LPA propagates node label information across the edges of the graph, while GCN propagates and transforms node feature information. 
However, while conceptually similar, it is unclear how LPA and GCN can be combined under a unified framework to improve node classification. Here we study the relationship between LPA and GCN in terms of feature/label influence, in which we characterize how much the initial feature/label of one node influences the final feature/label of another node in GCN/LPA. Based on our theoretical analysis, we propose an end-to-end model that combines GCN and LPA. In our unified model, edge weights are learnable, and the LPA serves as regularization to assist the GCN in learning proper edge weights that lead to improved classification performance. Our model can also be seen as learning the weights for edges based on node labels, which is more task-oriented than existing feature-based attention models and topology-based diffusion models. In a number of experiments on real-world graphs, our model shows superiority over state-of-the-art graph neural networks in terms of node classification accuracy.", + "title": "Unifying Graph Convolutional Neural Networks and Label Propagation", + "authors": [ + "Hongwei Wang", + "Jure Leskovec" + ], + "emails": [ + "~Hongwei_Wang1", + "~Jure_Leskovec1" + ], + "rank": 2113 + }, + { + "url": "https://openreview.net/forum?id=fTeb_adw5y4", + "decision": "Reject", + "ratings": [ + 6, + 2, + 5, + 7 + ], + "rating": "4.67", + "confidences": [ + 4, + 5, + 3, + 3 + ], + "abstract": "Neural networks lack adversarial robustness -- they are vulnerable to adversarial examples that through small perturbations to inputs cause incorrect predictions. Further, trust is undermined when models give miscalibrated uncertainty estimates, i.e. the predicted probability is not a good indicator of how much we should trust our model. In this paper, we study the connection between adversarial robustness and calibration on four classification networks and datasets. 
We find that the inputs for which the model is sensitive to small perturbations (are easily attacked) are more likely to have poorly calibrated predictions. Based on this insight, we examine if calibration can be improved by addressing those adversarially unrobust inputs. To this end, we propose Adversarial Robustness based Adaptive Label Smoothing (AR-AdaLS) that integrates the correlations of adversarial robustness and uncertainty into training by adaptively softening labels for an example based on how easily it can be attacked by an adversary. We find that our method, taking the adversarial robustness of the in-distribution data into consideration, leads to better calibration over the model even under distributional shifts. In addition, AR-AdaLS can also be applied to an ensemble model to further improve model's calibration.", + "title": "Improving Calibration through the Relationship with Adversarial Robustness", + "authors": [ + "Yao Qin", + "Xuezhi Wang", + "Alex Beutel", + "Ed Chi" + ], + "emails": [ + "~Ed_Chi1", + "~Alex_Beutel1", + "~Xuezhi_Wang3", + "~Yao_Qin1" + ], + "rank": 2114 + }, + { + "url": "https://openreview.net/forum?id=PEcNk5Bad7z", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4 + ], + "rating": "4.67", + "confidences": [ + 4, + 2, + 3 + ], + "abstract": "Recent work has constructed neural networks that are equivariant to continuous symmetry groups such as 2D and 3D rotations. This is accomplished using explicit group representations to derive the equivariant kernels and nonlinearities. We present two contributions motivated by frontier applications of equivariance beyond rotations and translations. First, we relax the requirement for explicit Lie group representations, presenting a novel algorithm that finds irreducible representations of noncommutative Lie groups given only the structure constants of the associated Lie algebra. 
Second, we demonstrate that Lorentz-equivariance is a useful prior for object-tracking tasks and construct the first object-tracking model equivariant to the Poincar\u00e9 group.", + "title": "Learning Irreducible Representations of Noncommutative Lie Groups", + "authors": [ + "Noah Shutty", + "Casimir Wierzynski" + ], + "emails": [ + "intel.com", + "~Noah_Shutty1" + ], + "rank": 2115 + }, + { + "url": "https://openreview.net/forum?id=6M4c3WegNtX", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 6 + ], + "rating": "4.67", + "confidences": [ + 4, + 5, + 5, + 4 + ], + "abstract": "Ensembles of neural networks achieve superior performance compared to stand-alone networks not only in terms of predictive performance, but also uncertainty calibration and robustness to dataset shift. Diversity among networks is believed to be key for building strong ensembles, but typical approaches, such as \\emph{deep ensembles}, only ensemble different weight vectors of a fixed architecture. Instead, we propose two methods for constructing ensembles to exploit diversity among networks with \\emph{varying} architectures. 
We find that the resulting ensembles are indeed more diverse and also exhibit better uncertainty calibration, predictive performance and robustness to dataset shift in comparison with deep ensembles on a variety of classification tasks.", + "title": "Neural Ensemble Search for Uncertainty Estimation and Dataset Shift", + "authors": [ + "Sheheryar Zaidi", + "Arber Zela", + "Thomas Elsken", + "Chris Holmes", + "Frank Hutter", + "Yee Whye Teh" + ], + "emails": [ + "~Thomas_Elsken1", + "~Frank_Hutter1", + "~Sheheryar_Zaidi1", + "~Arber_Zela1", + "~Yee_Whye_Teh1", + "stats.ox.ac.uk" + ], + "rank": 2116 + }, + { + "url": "https://openreview.net/forum?id=1Q-CqRjUzf", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4 + ], + "rating": "4.67", + "confidences": [ + 4, + 2, + 3 + ], + "abstract": "Standard training techniques for neural networks involve multiple sources of randomness, e.g., initialization, mini-batch ordering and in some cases data augmentation. Given that neural networks are heavily over-parameterized in practice, such randomness can cause {\\em churn} -- disagreements between predictions of the two models independently trained by the same algorithm, contributing to the `reproducibility challenges' in modern machine learning. In this paper, we study this problem of churn, identify factors that cause it, and propose two simple means of mitigating it. We first demonstrate that churn is indeed an issue, even for standard image classification tasks (CIFAR and ImageNet), and study the role of the different sources of training randomness that cause churn. By analyzing the relationship between churn and prediction confidences, we pursue an approach with two components for churn reduction. First, we propose using \\emph{minimum entropy regularizers} to increase prediction confidences. Second, we present a novel variant of co-distillation approach~\\citep{anil2018large} to increase model agreement and reduce churn. 
We present empirical results showing the effectiveness of both techniques in reducing churn while improving the accuracy of the underlying model.", + "title": "On the Reproducibility of Neural Network Predictions", + "authors": [ + "Srinadh Bhojanapalli", + "Kimberly Jenney Wilber", + "Andreas Veit", + "Ankit Singh Rawat", + "Seungyeon Kim", + "Aditya Krishna Menon", + "Sanjiv Kumar" + ], + "emails": [ + "~Ankit_Singh_Rawat1", + "~Kimberly_Jenney_Wilber1", + "~Andreas_Veit1", + "~Seungyeon_Kim1", + "~Srinadh_Bhojanapalli1", + "~Sanjiv_Kumar1", + "~Aditya_Krishna_Menon1" + ], + "rank": 2117 + }, + { + "url": "https://openreview.net/forum?id=8IbZUle6ieH", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5 + ], + "rating": "4.67", + "confidences": [ + 2, + 1, + 3 + ], + "abstract": "Graph Neural Networks (GNNs) have become the de facto method for machine learning on graph data (e.g., social networks, protein structures, code ASTs), but they require significant time and resource to train. One alternative method is Graph Neural Tangent Kernel (GNTK), a kernel method that corresponds to infinitely wide multi-layer GNNs. GNTK's parameters can be solved directly in a single step, avoiding time-consuming gradient descent. Today, GNTK is the state-of-the-art method to achieve high training speed without compromising accuracy. Unfortunately, solving for the kernel and searching for parameters can still take hours to days on real-world graphs. The current computation of GNTK has running time O(N4), where N is the number of nodes in the graph. This prevents GNTK from scaling to datasets that contain large graphs. Theoretically, we present two techniques to speed up GNTK training while preserving the generalization error: (1) We use a novel matrix decoupling method to reduce matrix dimensions during the kernel solving. This allows us to reduce the dominated computation bottleneck term from O(N4) to O(N3). 
(2) We apply sketching to further reduce the bottleneck term to o(N\u03c9), where \u03c9\u22482.373 is the exponent of current matrix multiplication. Experimentally, we demonstrate that our approaches speed up kernel learning by up to 19\u00d7 on real-world benchmark datasets.", + "title": "Graph Neural Network Acceleration via Matrix Dimension Reduction", + "authors": [ + "Shunhua Jiang", + "Yunze Man", + "Zhao Song", + "Danyang Zhuo" + ], + "emails": [ + "~Yunze_Man2", + "~Danyang_Zhuo1", + "~Shunhua_Jiang1", + "~Zhao_Song3" + ], + "rank": 2118 + }, + { + "url": "https://openreview.net/forum?id=ARaF-70QBJ1", + "ratings": [ + 3, + 5, + 5, + 6 + ], + "rating": "4.67", + "confidences": [ + 5, + 5, + 4, + 4 + ], + "abstract": "This work introduces pyramidal convolution (PyConv), which is capable of processing the input at multiple filter scales. PyConv contains a pyramid of kernels, where each level involves different types of filters with varying size and depth, which are able to capture different levels of details in the scene. On top of these improved recognition capabilities, PyConv is also efficient and, with our formulation, it does not increase the computational cost and parameters compared to standard convolution. Moreover, it is very flexible and extensible, providing a large space of potential network architectures for different applications. PyConv has the potential to impact nearly every computer vision task and, in this work, we present different architectures based on PyConv for four main tasks on visual recognition: image classification, video action classification/recognition, object detection and semantic image segmentation/parsing. Our approach shows significant improvements over all these core tasks in comparison with the baselines. 
For instance, on image recognition, our 50-layers network outperforms in terms of recognition performance on ImageNet dataset its counterpart baseline ResNet with 152 layers, while having 2.39 times less parameters, 2.52 times lower computational complexity and more than 3 times less layers. On image segmentation, our novel framework sets a new state-of-the-art on the challenging ADE20K benchmark for scene parsing. We will make the code and models publicly available.", + "title": "Pyramidal Convolution: Rethinking Convolutional Neural Networks for Visual Recognition", + "authors": [ + "Ionut Cosmin Duta", + "Li Liu", + "Fan Zhu", + "Ling Shao" + ], + "emails": [ + "~Fan_Zhu5", + "~Ling_Shao1", + "~Li_Liu12", + "~Ionut_Cosmin_Duta2" + ], + "rank": 2119 + }, + { + "url": "https://openreview.net/forum?id=3GYfIYvNNhL", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5 + ], + "rating": "4.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Humans are accustomed to environments that contain both regularities and exceptions. For example, at most gas stations, one pays prior to pumping, but the occasional rural station does not accept payment in advance.\nLikewise, deep neural networks can generalize across instances that share common patterns or structures yet have the capacity to memorize rare or irregular forms. We analyze how individual instances are treated by a model via a consistency score. The score characterizes the expected accuracy for a held-out instance given training sets of varying size sampled from the data distribution. We obtain empirical estimates of this score for individual instances in multiple data-sets, and we show that the score identifies out-of-distribution and mislabeled examples at one end of the continuum and strongly regular examples at the other end. We identify computationally inexpensive proxies to the consistency score using statistics collected during training. 
We apply the score toward understanding the dynamics of representation learning and to filter outliers during training, and we discuss other potential applications including curriculum learning and active data collection.", + "title": "Characterizing Structural Regularities of Labeled Data in Overparameterized Models", + "authors": [ + "Ziheng Jiang", + "Chiyuan Zhang", + "Kunal Talwar", + "Michael Curtis Mozer" + ], + "emails": [ + "~Kunal_Talwar1", + "~Ziheng_Jiang1", + "~Michael_Curtis_Mozer1", + "~Chiyuan_Zhang1" + ], + "rank": 2120 + }, + { + "url": "https://openreview.net/forum?id=sp3Z1jiS2vn", + "ratings": [ + 3, + 6, + 5 + ], + "rating": "4.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "In this work, we analyze linear operators from L2(S2)\u2192L2(S2) which are equivariant to azimuthal rotations, that is, rotations around the z-axis. Several high-performing neural networks defined on the sphere are indeed equivariant to azimuthal rotations, but not SO(3). Our main result is to show that a linear operator acting on band-limited functions on the sphere is equivariant to azimuthal rotations if and only if it can be realized as a block-diagonal matrix acting on the spherical harmonic expansion coefficients of its input. Further, we show that such an operation can be interpreted as a convolution or equivalently a correlation. 
Our theoretical findings are backed up with experimental results demonstrating that a state-of-the-art pipeline can be improved by making it fully equivariant to azimuthal rotations.", + "title": "Azimuthal Rotational Equivariance in Spherical CNNs", + "authors": [ + "Carl Toft", + "Georg B\u00f6kman", + "Fredrik Kahl" + ], + "emails": [ + "chalmers.se", + "~Carl_Toft1", + "~Fredrik_Kahl3" + ], + "rank": 2121 + }, + { + "url": "https://openreview.net/forum?id=4CxsUBDQJqv", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5 + ], + "rating": "4.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Learning effective policies for sparse objectives is a key challenge in Deep Reinforcement Learning (RL). A common approach is to design task-related dense rewards to improve task learnability. While such rewards are easily interpreted, they rely on heuristics and domain expertise. Alternate approaches that train neural networks to discover dense surrogate rewards avoid heuristics, but are high-dimensional, black-box solutions offering little interpretability. In this paper, we present a method that discovers dense rewards in the form of low-dimensional symbolic trees - thus making them more tractable for analysis. The trees use simple functional operators to map an agent's observations to a scalar reward, which then supervises the policy gradient learning of a neural network policy. We test our method on continuous action spaces in Mujoco and discrete action spaces in Atari and Pygames environments. We show that the discovered dense rewards are an effective signal for an RL policy to solve the benchmark tasks. 
Notably, we significantly outperform a widely used, contemporary neural-network based reward-discovery algorithm in all environments considered.", + "title": "Learning Intrinsic Symbolic Rewards in Reinforcement Learning", + "authors": [ + "Hassam Sheikh", + "Shauharda Khadka", + "Santiago Miret", + "Somdeb Majumdar" + ], + "emails": [ + "~Santiago_Miret1", + "~Shauharda_Khadka1", + "~Somdeb_Majumdar1", + "~Hassam_Sheikh1" + ], + "rank": 2122 + }, + { + "url": "https://openreview.net/forum?id=7ODIasgLJlU", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5, + 5 + ], + "rating": "4.67", + "confidences": [ + 5, + 3, + 3, + 4 + ], + "abstract": "We initiate the study on deep reinforcement learning problems that require low switching cost, i.e., small number of policy switches during training. Such a requirement is ubiquitous in many applications, such as medical domains, recommendation systems, education, robotics, dialogue agents, etc, where the deployed policy that actually interacts with the environment cannot change frequently. Our paper investigates different policy switching criteria based on deep Q-networks and further proposes an adaptive approach based on the feature distance between the deployed Q-network and the underlying learning Q-network. Through extensive experiments on a medical treatment environment and a collection of the Atari games, we find our feature-switching criterion substantially decreases the switching cost while maintains a similar sample efficiency to the case without the low-switching-cost constraint. 
We also complement this empirical finding with a theoretical justification from a representation learning perspective.", + "title": "Deep Q-Learning with Low Switching Cost", + "authors": [ + "Shusheng Xu", + "Simon Shaolei Du", + "Yi Wu" + ], + "emails": [ + "~Shusheng_Xu1", + "~Simon_Shaolei_Du1", + "~Yi_Wu1" + ], + "rank": 2123 + }, + { + "url": "https://openreview.net/forum?id=Q2iaAc-4I1v", + "decision": "Reject", + "ratings": [ + 3, + 5, + 6, + 5 + ], + "rating": "4.67", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Humans show an innate ability to learn the regularities of the world through interaction. By performing experiments in our environment, we are able to discern the causal factors of variation and infer how they affect the dynamics of our world. Analogously, here we attempt to equip reinforcement learning agents with the ability to perform experiments that facilitate a categorization of the rolled-out trajectories, and to subsequently infer the causal factors of the environment in a hierarchical manner. We introduce a novel intrinsic reward, called causal curiosity, and show that it allows our agents to learn optimal sequences of actions, and to discover causal factors in the dynamics. The learned behavior allows the agent to infer a binary quantized representation for the ground-truth causal factors in every environment. Additionally, we find that these experimental behaviors are semantically meaningful (e.g., to differentiate between heavy and light blocks, our agents learn to lift them), and are learnt in a self-supervised manner with approximately 2.5 times less data than conventional supervised planners. We show that these behaviors can be re-purposed and fine-tuned (e.g., from lifting to pushing or other downstream tasks). 
Finally, we show that the knowledge of causal factor representations aids zero-shot learning for more complex tasks.", + "title": "Causal Curiosity: RL Agents Discovering Self-supervised Experiments for Causal Representation Learning", + "authors": [ + "Sumedh Anand Sontakke", + "Arash Mehrjou", + "Theofanis Karaletsos", + "Laurent Itti", + "Bernhard Sch\u00f6lkopf" + ], + "emails": [ + "~Arash_Mehrjou1", + "~Bernhard_Sch\u00f6lkopf1", + "~Theofanis_Karaletsos1", + "~Laurent_Itti2", + "~Sumedh_Anand_Sontakke1" + ], + "rank": 2124 + }, + { + "url": "https://openreview.net/forum?id=u_bGm5lrm72", + "decision": "Reject", + "ratings": [ + 5, + 3, + 6 + ], + "rating": "4.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Bio-inspired spiking neural networks (SNNs), operating with asynchronous binary signals (or spikes) distributed over time, can potentially lead to greater computational efficiency on event-driven hardware. The state-of-the-art SNNs suffer from high inference latency, resulting from inefficient input encoding, and sub-optimal settings of the neuron parameters (firing threshold, and membrane leak). We propose DIET-SNN, a low latency deep spiking network that is trained with gradient descent to optimize the membrane leak and the firing threshold along with other network parameters (weights). The membrane leak and threshold for each layer of the SNN are optimized with end-to-end backpropagation to achieve competitive accuracy at reduced latency. The analog pixel values of an image are directly applied to the input layer of DIET-SNN without the need to convert to spike-train. The first convolutional layer is trained to convert inputs into spikes where leaky-integrate-and-fire (LIF) neurons integrate the weighted inputs and generate an output spike when the membrane potential crosses the trained firing threshold. 
The trained membrane leak controls the flow of input information and attenuates irrelevant inputs to increase the activation sparsity in the convolutional and linear layers of the network. The reduced latency combined with high activation sparsity provides large improvements in computational efficiency. We evaluate DIET-SNN on image classification tasks from CIFAR and ImageNet datasets on VGG and ResNet architectures. We achieve top-1 accuracy of 69% with 5 timesteps (inference latency) on the ImageNet dataset with 12x less compute energy than an equivalent standard ANN. Additionally, DIET-SNN performs 20-500x faster inference compared to other state-of-the-art SNN models.", + "title": "DIET-SNN: A Low-Latency Spiking Neural Network with Direct Input Encoding & Leakage and Threshold Optimization", + "authors": [ + "Nitin Rathi", + "Kaushik Roy" + ], + "emails": [ + "~Kaushik_Roy1", + "~Nitin_Rathi1" + ], + "rank": 2125 + }, + { + "url": "https://openreview.net/forum?id=Y4SOA2qsYJS", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4 + ], + "rating": "4.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Hyperspectral anomaly detection under high-dimensional data and interference of deteriorated bands without any prior information has been challenging and attracted close attention in the exploration of the unknown in real scenarios. However, some emerging methods based on generative adversarial network (GAN) suffer from the problems of gradient vanishing and training instability with struggling to strike a balance between performance and training sample limitations. In this work, aiming to remedy the drawbacks of existing methods, we present a novel multi-scale covariance map (MCM)-aware twin-least-square GAN (MTGAN). 
Instead of the widely used single-scale Gaussian hypothesis background estimation, in MTGAN, we introduce the MCM-aware strategy to construct multi-scale priors with precise second-order statistics, thereby implicitly bridging the spatial and spectral information. Thus, we reliably and adaptively represent the prior of HSI to change the priors-lack situation. Moreover, we impose the twin-least-square loss on GAN, which helps improve the generative ability and training stability in feature and image domains, overcoming the gradient vanishing problem. Finally, the network enforced with a new anomaly rejection loss establishes a pure and discriminative background estimation. Experiments demonstrate that the average detection accuracy of MTGAN reaches 0.99809, which is superior to the state-of-the-art algorithms.", + "title": "MCM-aware Twin-least-square GAN for Hyperspectral Anomaly Detection", + "authors": [ + "Jiaping Zhong", + "Weiying Xie", + "Jie Lei", + "Yunsong Li", + "Zan Li" + ], + "emails": [ + "~Yunsong_Li1", + "stu.xidian.edu.cn", + "xidian.edu.cn", + "mail.xidian.edu.cn", + "~Weiying_Xie1" + ], + "rank": 2126 + }, + { + "url": "https://openreview.net/forum?id=jHykXSIk3ch", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5 + ], + "rating": "4.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Batch Normalization (BN) is a prominent deep learning technique. In spite of its apparent simplicity, its implications over optimization are yet to be fully understood. While previous studies mostly focus on the interaction between BN and stochastic gradient descent (SGD), we develop a geometric perspective which allows us to precisely characterize the relation between BN and Adam. More precisely, we leverage the radial invariance of groups of parameters, such as filters for convolutional neural networks, to translate the optimization steps on the L2 unit hypersphere. 
This formulation and the associated geometric interpretation shed new light on the training dynamics. Firstly, we use it to derive the first effective learning rate expression of Adam. Then we show that, in the presence of BN layers, performing SGD alone is actually equivalent to a variant of Adam constrained to the unit hypersphere. Finally, our analysis outlines phenomena that previous variants of Adam act on and we experimentally validate their importance in the optimization process.", + "title": "A spherical analysis of Adam with Batch Normalization", + "authors": [ + "Simon Roburin", + "Yann Dubois de Mont-Marin", + "Andrei Bursuc", + "Renaud Marlet", + "Patrick Perez", + "Mathieu Aubry" + ], + "emails": [ + "~Andrei_Bursuc1", + "~Patrick_Perez1", + "~Renaud_Marlet1", + "~Simon_Roburin1", + "~Mathieu_Aubry3", + "~Yann_Dubois_de_Mont-Marin1" + ], + "rank": 2127 + }, + { + "url": "https://openreview.net/forum?id=NLuOUSp9zZd", + "decision": "Reject", + "ratings": [ + 3, + 6, + 4, + 6 + ], + "rating": "4.67", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "In this paper, we propose a new approach to train Generative Adversarial Networks (GAN) where we deploy a double-oracle framework using the generator and discriminator oracles. GAN is essentially a two-player zero-sum game between the generator and the discriminator. Training GANs is challenging as a pure Nash equilibrium may not exist and even finding the mixed Nash equilibrium is difficult as GANs have a large-scale strategy space. In DO-GAN, we extend the double oracle framework to GANs. We first generalize the player strategies as the trained models of generator and discriminator from the best response oracles. We then compute the meta-strategies using a linear program. Next, we prune the weakly-dominated player strategies to keep the oracles from becoming intractable. 
We apply our framework to established architectures such as vanilla GAN, Deep Convolutional GAN, Spectral Normalization GAN and Stacked GAN. Finally, we conduct evaluations on MNIST, CIFAR-10 and CelebA datasets and show that DO-GAN variants have significant improvements in both subjective qualitative evaluation and quantitative metrics, compared with their respective GAN architectures.", + "title": "DO-GAN: A Double Oracle Framework for Generative Adversarial Networks", + "authors": [ + "Aye Phyu Phyu Aung", + "Xinrun Wang", + "Runsheng Yu", + "Bo An", + "Senthilnath Jayavelu", + "Xiaoli Li" + ], + "emails": [ + "~Xiaoli_Li1", + "~Bo_An2", + "i2r.a-star.edu.sg", + "~Xinrun_Wang1", + "~Runsheng_Yu2", + "~Aye_Phyu_Phyu_Aung1" + ], + "rank": 2128 + }, + { + "url": "https://openreview.net/forum?id=nySHNUlKTVw", + "ratings": [ + 3, + 7, + 5, + 4 + ], + "rating": "4.67", + "confidences": [ + 5, + 4, + 3, + 3 + ], + "abstract": "Graphs have attracted numerous attention in varied areas and are dynamic in many scenarios. Among dynamic graphs, growing graphs with frequently expanding vertex and edge sets are typical and widely existed, e.g. the rapidly growing social networks. Confronting such growing data, existing methods on either static or dynamic graphs take the entire graph as a whole and may suffer from high computation cost and memory usage due to the continual growth of graphs. To tackle this problem, we introduce incremental graph learning (IGL), a general framework to formulate the learning on growing graphs in an incremental manner, where traditional graph learning method could be deployed as a basic model. We first analyze the problems of directly finetuning on the incremental part of graph, and theoretically discuss the unbiased and edge-preserved conditions of IGL. 
In our method, when the graph grows with new-coming data, we select or generate vertices and edges within restricted sizes from the previous graph to update current model together with the new data. Here, two strategies, i.e. sample-based and cluster-based, are proposed for learning with restricted time and space complexity. We conduct experiments on the node classification and link prediction tasks of multiple datasets. Experimental results and comparisons show that our method achieves satisfying performance with high efficiency on growing graphs.", + "title": "Incremental Learning on Growing Graphs", + "authors": [ + "Yutong Feng", + "Jianwen Jiang", + "Yue Gao" + ], + "emails": [ + "~Jianwen_Jiang2", + "~Yue_Gao4", + "~Yutong_Feng2" + ], + "rank": 2129 + }, + { + "url": "https://openreview.net/forum?id=Hpxrls8yAn", + "ratings": [ + 5, + 4, + 7, + 3 + ], + "rating": "4.67", + "confidences": [ + 5, + 3, + 3, + 4 + ], + "abstract": "The latent dynamics model summarizes an agent\u2019s high dimensional experiences in a compact way. While learning from imagined trajectories by the latent model is confirmed to has great potential to facilitate behavior learning, the lack of memory diversity limits generalization capability. Inspired by a neuroscience experiment of \u201cforming artificial memories during sleep\u201d, we propose a robust memory augmentation method with Constrained Latent ImaginatiON (CLION) under a novel actor-critic framework, which aims to speed up the learning of the optimal policy with virtual episodic. 
Various experiments on high-dimensional visual control tasks with arbitrary image uncertainty demonstrate that CLION outperforms existing approaches in terms of data-efficiency, robustness to uncertainty, and final performance.", + "title": "Robust Memory Augmentation by Constrained Latent Imagination", + "authors": [ + "Yao Mu", + "Yuzheng Zhuang", + "Bin Wang", + "Wulong Liu", + "Shengbo Eben Li", + "Jianye HAO" + ], + "emails": [ + "~Bin_Wang12", + "~Yao_Mu1", + "~Wulong_Liu1", + "~Yuzheng_Zhuang1", + "~Jianye_HAO1", + "~Shengbo_Eben_Li1" + ], + "rank": 2130 + }, + { + "url": "https://openreview.net/forum?id=PP4KyAaBoBK", + "decision": "Reject", + "ratings": [ + 7, + 6, + 3, + 4 + ], + "rating": "4.67", + "confidences": [ + 3, + 3, + 5, + 4 + ], + "abstract": "Computer vision technology is widely used in biological and medical data analysis and understanding. However, there are still two major bottlenecks in the field of cell membrane segmentation, which seriously hinder further research: lack of sufficient high-quality data and lack of suitable evaluation criteria. In order to solve these two problems, this paper first introduces an Ultra-high Resolution Image Segmentation dataset for the Cell membrane, called U-RISC, the largest annotated EM dataset for the Cell membrane with multiple iterative annotations and uncompressed high-resolution raw data. During the analysis process of the U-RISC, we found that the current popular segmentation evaluation criteria are inconsistent with human perception. This interesting phenomenon is confirmed by a subjective experiment involving twenty people. Furthermore, to resolve this inconsistency, we propose a Perceptual Hausdorff Distance (PHD) evaluation criterion to measure the quality of cell membrane segmentation results. 
Detailed performance comparison and discussion of classic segmentation methods along with two iterative manual annotation results under existing criteria and PHD is given.", + "title": "Human Perception-based Evaluation Criterion for Ultra-high Resolution Cell Membrane Segmentation", + "authors": [ + "Ruohua Shi", + "Wenyao Wang", + "Zhixuan Li", + "Liuyuan He", + "Kaiwen Sheng", + "Lei Ma", + "Kai Du", + "Tingting Jiang", + "Tiejun Huang" + ], + "emails": [ + "~Liuyuan_He1", + "~Lei_Ma3", + "~Zhixuan_Li1", + "~Kaiwen_Sheng1", + "~Ruohua_Shi1", + "~Tiejun_Huang1", + "~Kai_Du1", + "~Tingting_Jiang2", + "~Wenyao_Wang1" + ], + "rank": 2131 + }, + { + "url": "https://openreview.net/forum?id=Efiwpsy0ZE_", + "ratings": [ + 5, + 4, + 5 + ], + "rating": "4.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Graph Reasoning has shown great potential recently in modeling long-range dependencies, which are crucial for various computer vision tasks. However, the graph representation learned by existing methods is not effective enough as the relation between feature and graph is under-explored. In this work, we propose a novel method named Contextual Graph Reasoning (CGR) that learns a context-aware relation between feature and graph. This is achieved by constructing the projection matrix based on a global set of descriptors during graph projection, and calibrating the evolved graph based on the self-attention of all nodes during graph reprojection. Therefore, contextual information is well explored in both graph projection and reprojection with our method. To verify the effectiveness of our method, we conduct extensive experiments on semantic segmentation, instance segmentation, and 2D human pose estimation. 
Our method consistently achieves remarkable improvements over state-of-the-art methods, demonstrating the effectiveness and generalization ability of our method.", + "title": "Contextual Graph Reasoning Networks", + "authors": [ + "Zhaoqing Wang", + "Jiaming Liu", + "Yangyuxuan Kang", + "Mingming Gong", + "Chuang Zhang", + "Ming Lu", + "Ming Wu" + ], + "emails": [ + "~Mingming_Gong1", + "~Yangyuxuan_Kang1", + "~Jiaming_Liu2", + "~Zhaoqing_Wang1", + "~Chuang_Zhang1", + "~Ming_Wu2", + "~Ming_Lu2" + ], + "rank": 2132 + }, + { + "url": "https://openreview.net/forum?id=XMoyS8zm6GA", + "decision": "Reject", + "ratings": [ + 6, + 4, + 4, + 5 + ], + "rating": "4.67", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Deep neural network classifiers naturally partition input space into regions belonging to different classes. The geometry of these class manifolds (CMs) is widely studied and is intimately related to model performance; for example, the margin is defined via boundaries between these CMs. We present a simple technique to estimate the effective dimension of CMs as well as boundaries between multiple CMs, by computing their intersection with random affine subspaces of varying dimension. We provide a theory for the technique and verify that our theoretical predictions agree with measurements on real neural networks. Through extensive experiments, we leverage this method to show deep connections between the geometry of CMs, generalization, and robustness. In particular we investigate how CM dimension depends on 1) the dataset, 2) architecture, 3) random initialization, 4) stage of training, 5) class, 6) ensemble size, 7) label randomization, 8) training set size, and 9) model robustness to data corruption. Together a picture emerges that well-performing, robust models have higher dimensional CMs than worse performing models. Moreover, we offer a unique perspective on ensembling via intersections of CMs. 
Our core code is available on Github (link in the PDF abstract).", + "title": "Slice, Dice, and Optimize: Measuring the Dimension of Neural Network Class Manifolds", + "authors": [ + "Stanislav Fort", + "Ekin Dogus Cubuk", + "Surya Ganguli", + "Samuel Stern Schoenholz" + ], + "emails": [ + "~Samuel_Stern_Schoenholz1", + "~Surya_Ganguli1", + "~Stanislav_Fort1", + "~Ekin_Dogus_Cubuk1" + ], + "rank": 2133 + }, + { + "url": "https://openreview.net/forum?id=unRf7cz1o1", + "ratings": [ + 6, + 5, + 4, + 4 + ], + "rating": "4.67", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Model inversion (MI) attacks in the whitebox setting are aimed at reconstructing training data from model parameters. Such attacks have triggered increasing concerns about privacy, especially given a growing number of online model repositories. However, existing MI attacks against deep neural networks (DNNs) have large room for performance improvement. A natural question is whether the underperformance is because the target model does not memorize much about its training data or it is simply an artifact of imperfect attack algorithm design? This paper shows that it is the latter. We present a variety of new techniques that can significantly boost the performance of MI attacks against DNNs. Recent advances to attack DNNs are largely attributed to the idea of training a general generative adversarial network (GAN) with potential public data and using it to regularize the search space for reconstructed images. We propose to customize the training of a GAN to the inversion task so as to better distill knowledge useful for performing attacks from public data. Moreover, unlike previous work that directly searches for a single data point to represent a target class, we propose to model private data distribution in order to better reconstruct representative data points. 
Our experiments show that the combination of these techniques can lead to state-of-the-art attack performance on a variety of datasets and models, even when the public data has a large distributional shift from the private data.", + "title": "Improved Techniques for Model Inversion Attacks", + "authors": [ + "Si Chen", + "Ruoxi Jia", + "Guo-Jun Qi" + ], + "emails": [ + "~Ruoxi_Jia1", + "~Si_Chen5", + "~Guo-Jun_Qi1" + ], + "rank": 2134 + }, + { + "url": "https://openreview.net/forum?id=ZCix3kmMo4Q", + "ratings": [ + 7, + 4, + 3 + ], + "rating": "4.67", + "confidences": [ + 5, + 5, + 5 + ], + "abstract": "In multi-object detection using neural networks, most methods train a network based on ground truth assignment, which makes the training too heuristic and complicated. In this paper, we reformulate the multi-object detection task as a problem of density estimation of bounding boxes. Instead of using a ground-truth-assignment-based method, we train a network by estimating the probability density of bounding boxes in an input image using a mixture model. For this purpose, we propose a novel network for object detection called Mixture Density Object Detector (MDOD), and the corresponding objective function for the density-estimation-based training. Unlike the ground-truth-assignment-based methods, our proposed method gets rid of the cumbersome processes of matching between ground truth boxes and their predictions as well as the heuristic anchor design. It is also free from the problem of foreground-background imbalance. We applied MDOD to MS COCO dataset. Our proposed method not only deals with multi-object detection problems in a new approach, but also improves detection performances through MDOD. 
Code will be available.", + "title": "Density-Based Object Detection: Learning Bounding Boxes without Ground Truth Assignment", + "authors": [ + "Jaeyoung Yoo", + "Hojun Lee", + "Inseop Chung", + "Geonseok Seo", + "Nojun Kwak" + ], + "emails": [ + "~Geonseok_Seo2", + "~Nojun_Kwak1", + "~Hojun_Lee2", + "~Jaeyoung_Yoo2", + "~Inseop_Chung1" + ], + "rank": 2135 + }, + { + "url": "https://openreview.net/forum?id=D04TGKz5rfF", + "decision": "Reject", + "ratings": [ + 7, + 5, + 4, + 3 + ], + "rating": "4.65", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "It is well known that deep neural networks are vulnerable to adversarial examples. We attempt to understand adversarial examples from the perspective of frequency analysis. Several works have empirically shown that the gradient-based adversarial attacks perform differently in the low-frequency and high-frequency part of the input data. But there is still a lack of theoretical justification of these phenomena. In this work, we both theoretically and empirically show that the adversarial perturbations gradually increase the concentration in the low-frequency domain of the spectrum during the training process of the model parameters. And the log-spectrum difference of the adversarial examples and clean image is more concentrated in the high-frequency part than the low-frequency part. We also find out that the ratio of the high-frequency and the low-frequency part in the adversarial perturbation is much larger than that in the corresponding natural image. Inspired by these important theoretical findings, we apply low-pass filter to potential adversarial examples before feeding them to the model. 
The results show that this preprocessing can significantly improve the robustness of the model.", + "title": "A frequency domain analysis of gradient-based adversarial examples", + "authors": [ + "Bochen Lv", + "Pu Yang", + "Zehao Wang", + "Zhanxing Zhu" + ], + "emails": [ + "gmail.com", + "~Zhanxing_Zhu1", + "~Zehao_Wang2", + "pku.edu.cn" + ], + "rank": 2136 + }, + { + "url": "https://openreview.net/forum?id=LtgEkhLScK3", + "decision": "Reject", + "ratings": [ + 6, + 3, + 6, + 4 + ], + "rating": "4.65", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "Deep reinforcement learning (DRL) has successfully solved various problems recently, typically with a unimodal policy representation. However, grasping the decomposable and hierarchical structures within a complex task can be essential for further improving its learning efficiency and performance, which may lead to a multimodal policy or a mixture-of-experts (MOE). To our best knowledge, present DRL algorithms for general utility do not deploy MOE methods as policy function approximators due to the lack of differentiability, or without explicit probabilistic representation. In this work, we propose a differentiable probabilistic mixture-of-experts (PMOE) embedded in the end-to-end training scheme for generic off-policy and on-policy algorithms using stochastic policies, e.g., Soft Actor-Critic (SAC) and Proximal Policy Optimisation (PPO). Experimental results testify the advantageous performance of our method over unimodal polices and three different MOE methods, as well as a method of option frameworks, based on two types of DRL algorithms. 
We also demonstrate the distinguishable primitives learned with PMOE in different environments.", + "title": "Probabilistic Mixture-of-Experts for Efficient Deep Reinforcement Learning", + "authors": [ + "Jie Ren", + "Yewen Li", + "Zihan Ding", + "Wei Pan", + "Hao Dong" + ], + "emails": [ + "~Wei_Pan2", + "~Hao_Dong3", + "~Yewen_Li1", + "~Zihan_Ding1", + "~Jie_Ren4" + ], + "rank": 2137 + }, + { + "url": "https://openreview.net/forum?id=dOiHyqVaFkg", + "decision": "Reject", + "ratings": [ + 5, + 2, + 7, + 6, + 5 + ], + "rating": "4.65", + "confidences": [ + 3, + 5, + 3, + 3, + 3 + ], + "abstract": "We first pose the Unsupervised Progressive Learning (UPL) problem: an online representation learning problem in which the learner observes a non-stationary and unlabeled data stream, and identifies a growing number of features that persist over time even though the data is not stored or replayed. To solve the UPL problem we propose the Self-Taught Associative Memory (STAM) architecture. Layered hierarchies of STAM modules learn based on a combination of online clustering, novelty detection, forgetting outliers, and storing only prototypical features rather than specific examples. We evaluate STAM representations using classification and clustering tasks. While there are no existing learning scenarios which are directly comparable to UPL, we compare the STAM architecture with two recent continual learning works; Memory Aware Synapses (MAS), and Gradient Episodic Memories (GEM), which have been modified to be suitable for the UPL setting. 
", + "title": "Unsupervised Progressive Learning and the STAM Architecture", + "authors": [ + "James Smith", + "Cameron Ethan Taylor", + "Seth Baer", + "Constantine Dovrolis" + ], + "emails": [ + "~Cameron_Ethan_Taylor1", + "~James_Smith1", + "~Seth_Baer1", + "~Constantine_Dovrolis1" + ], + "rank": 2138 + }, + { + "url": "https://openreview.net/forum?id=QnzSSoqmAvB", + "decision": "Reject", + "ratings": [ + 3, + 4, + 6, + 5, + 7 + ], + "rating": "4.65", + "confidences": [ + 4, + 4, + 4, + 4, + 1 + ], + "abstract": "The MuZero algorithm is known for achieving high-level performance on traditional zero-sum two-player games of perfect information such as chess, Go, and shogi, as well as visual, non-zero sum, single-player environments such as the Atari suite. Despite lacking a perfect simulator and employing a learned model of environmental dynamics, MuZero produces game-playing agents comparable to its predecessor AlphaZero. However, the current implementation of MuZero is restricted only to deterministic environments. This paper presents Nondeterministic MuZero (NDMZ), an extension of MuZero for nondeterministic, two-player, zero-sum games of perfect information. Borrowing from Nondeterministic Monte Carlo Tree Search and the theory of extensive-form games, NDMZ formalizes chance as a player in the game and incorporates it into the MuZero network architecture and tree search. 
Experiments show that NDMZ is capable of learning effective strategies and an accurate model of the game.", + "title": "Playing Nondeterministic Games through Planning with a Learned Model", + "authors": [ + "Thomas Willkens", + "Jordan Pollack" + ], + "emails": [ + "~Jordan_Pollack1", + "~Thomas_Willkens1" + ], + "rank": 2139 + }, + { + "url": "https://openreview.net/forum?id=v-9E8egy_i", + "decision": "Reject", + "ratings": [ + 7, + 4, + 5, + 2 + ], + "rating": "4.65", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Relational Graph Neural Networks (GNN) are a class of GNN that are capable of handling multi-relational graphs. Like all GNNs, they suffer from a drop in performance when training deeper networks, which may be caused by vanishing gradients, over-parameterization, and oversmoothing. Previous works have investigated methods that improve the training of deeper GNNs, which include normalization techniques and various types of skip connection within a node. However, learning long-range patterns in multi-relational graphs using GNNs remains an under-explored topic. In this work, we propose a novel GNN architecture based on the Graph Attention Network (GAT) that uses gated skip connections to improve long-range modeling between nodes and uses a more scalable vector-based approach for parameterizing relations. We perform an extensive experimental analysis on synthetic and real data, focusing explicitly on learning long-range patterns. 
The results indicate that the proposed method significantly outperforms several commonly used relational GNN variants when used in deeper configurations and stays competitive to existing architectures in a shallow setup.", + "title": "Gated Relational Graph Attention Networks", + "authors": [ + "Denis Lukovnikov", + "Asja Fischer" + ], + "emails": [ + "~Denis_Lukovnikov1", + "~Asja_Fischer1" + ], + "rank": 2140 + }, + { + "url": "https://openreview.net/forum?id=UGTbHjl3S95", + "ratings": [ + 5, + 5, + 4, + 4, + 5 + ], + "rating": "4.65", + "confidences": [ + 5, + 3, + 4, + 2, + 3 + ], + "abstract": "Several variants of stochastic gradient descent (SGD) have been proposed to improve the learning effectiveness and efficiency when training deep neural networks, among which some recent influential attempts would like to adaptively control the parameter-wise learning rate (\\emph{e.g.}, Adam and RMSProp). Although they show a large improvement in convergence speed, most adaptive learning rate methods suffer from compromised generalization compared with SGD. In this paper, we proposed an Adaptive Gradient Method with Resilience and Momentum (AdaRem), motivated by the observation that the oscillations of network parameters slow the training, and give a theoretical proof of convergence. For each parameter, AdaRem adjusts the parameter-wise learning rate according to whether the direction of one parameter changes in the past is aligned with the direction of the current gradient, and thus encourages long-term consistent parameter updating with much fewer oscillations. 
Comprehensive experiments have been conducted to verify the effectiveness of AdaRem when training various models on a large-scale image recognition dataset, \\emph{i.e.}, ImageNet, which also demonstrate that our method outperforms previous adaptive learning rate-based algorithms in terms of the training speed and the test error, respectively.", + "title": "Adaptive Gradient Method with Resilience and Momentum", + "authors": [ + "Jie Liu", + "Chen Lin", + "Chuming Li", + "Lu Sheng", + "Ming Sun", + "Junjie Yan", + "Wanli Ouyang" + ], + "emails": [ + "~Wanli_Ouyang1", + "~Junjie_Yan4", + "~Ming_Sun4", + "~Lu_Sheng1", + "~Jie_Liu13", + "~Chuming_Li1", + "~Chen_Lin2" + ], + "rank": 2141 + }, + { + "url": "https://openreview.net/forum?id=Bw7VC-DJUM", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4, + 6 + ], + "rating": "4.65", + "confidences": [ + 5, + 5, + 4, + 3 + ], + "abstract": "Current video representations heavily rely on learning from manually annotated video datasets which are time-consuming and expensive to acquire. We observe videos are naturally accompanied by abundant text information such as YouTube titles and Instagram captions. In this paper, we leverage this visual-textual connection to learn spatiotemporal features in an efficient weakly-supervised manner. We present a general cross-modal pair discrimination (CPD) framework to capture this correlation between a video and its associated text. We train our CPD models on both standard video dataset (Kinetics-210k) and uncurated web video dataset (Instagram-300k) to demonstrate its effectiveness. Without further fine-tuning, the learnt models obtain competitive results for action classification on Kinetics under the linear classification protocol. 
Moreover, our visual model provides an effective initialization to fine-tune on downstream tasks, which yields a remarkable performance gain for action recognition on UCF101 and HMDB51, compared with the existing state-of-the-art self-supervised training methods. In addition, our CPD demonstrates that pre-training on a relatively small dataset is able to yield a comparable performance to those methods of using order magnitude more data, which is meaningful and practicable for the scenarios with limited computational facilities.", + "title": "Learning Spatiotemporal Features via Video and Text Pair Discrimination", + "authors": [ + "Tianhao Li", + "Limin Wang" + ], + "emails": [ + "~Limin_Wang1", + "~Tianhao_Li1" + ], + "rank": 2142 + }, + { + "url": "https://openreview.net/forum?id=vvSPyGfZre6", + "ratings": [ + 4, + 4, + 7, + 5 + ], + "rating": "4.64", + "confidences": [ + 4, + 2, + 1, + 4 + ], + "abstract": "Very recently, the Neural Cellular Automata (NCA) has been proposed to simulate the morphogenesis process with deep networks. NCA learns to grow an image starting from a fixed single pixel. \nIn this work, we show that the neural network (NN) architecture of the NCA can be encapsulated in a larger NN. This allows us to propose a new model that encodes a manifold of NCA, each of them capable of generating a distinct image. Therefore, we are effectively learning a representation space of CA, which shows generalization capabilities. 
We accomplish this by introducing dynamic convolutions inside an Auto-Encoder architecture, for the first time used to join two different sources of information, the encoding and cell's environment information.\nIn biological terms, our approach would play the role of the transcription factors, modulating the mapping of genes into specific proteins that drive cellular differentiation, which occurs right before the morphogenesis.\nWe thoroughly evaluate our approach in a dataset of synthetic emojis and also in real images of CIFAR-10.\nOur model introduces a general-purpose network, which can be used in a broad range of problems beyond image generation. ", + "title": "Neural Cellular Automata Manifold", + "authors": [ + "Alejandro Hernandez Ruiz", + "Armand Vilalta", + "Francesc Moreno-Noguer" + ], + "emails": [ + "~Alejandro_Hernandez_Ruiz1", + "~Armand_Vilalta1", + "~Francesc_Moreno-Noguer1" + ], + "rank": 2143 + }, + { + "url": "https://openreview.net/forum?id=PU35uLgRZkk", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5 + ], + "rating": "4.64", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": " The option framework, one of the most promising Hierarchical Reinforcement Learning (HRL) frameworks, is developed based on the Semi-Markov Decision Problem (SMDP) and employs a triple formulation of the option (i.e., an action policy, a termination probability, and an initiation set). These design choices, however, mean that the option framework: 1) has low sample efficiency, 2) cannot use more stable Markov Decision Problem (MDP) based learning algorithms, 3) represents abstract actions implicitly, and 4) is expensive to scale up. To overcome these problems, here we propose a simple yet effective MDP implementation of the option framework: the Skill-Action (SA) architecture. 
Derived from a novel discovery that the SMDP option framework has an MDP equivalence, SA hierarchically extracts skills (abstract actions) from primary actions and explicitly encodes these knowledge into skill context vectors (embedding vectors). Although SA is MDP formulated, skills can still be temporally extended by applying the attention mechanism to skill context vectors. Unlike the option framework, which requires M action policies for M skills, SA's action policy only needs one decoder to decode skill context vectors into primary actions. Under this formulation, SA can be optimized with any MDP based policy gradient algorithm. Moreover, it is sample efficient, cheap to scale up, and theoretically proven to have lower variance. Our empirical studies on challenging infinite horizon robot simulation environments demonstrate that SA not only outperforms all baselines by a large margin, but also exhibits smaller variance, faster convergence, and good interpretability. On transfer learning tasks, SA also outperforms the other models and shows its advantage on reusing knowledge across tasks. A potential impact of SA is to pave the way for a large scale pre-training architecture in the reinforcement learning area.", + "title": "The Skill-Action Architecture: Learning Abstract Action Embeddings for Reinforcement Learning", + "authors": [ + "Chang Li", + "Dongjin Song", + "Dacheng Tao" + ], + "emails": [ + "~Chang_Li5", + "~Dongjin_Song2", + "~Dacheng_Tao1" + ], + "rank": 2144 + }, + { + "url": "https://openreview.net/forum?id=5xaInvrGWp", + "decision": "Reject", + "ratings": [ + 4, + 6, + 5, + 4 + ], + "rating": "4.64", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "In federated learning, data is distributed among local clients which collaboratively train a prediction model using secure aggregation. 
To preserve the privacy of the clients, the federated learning paradigm requires each client to maintain a private local training data set, and only uploads its summarized model updates to the server. In this work, we show that this paradigm could lead to a vulnerable model, which collapses in performance when the corrupted data samples (under adversarial manipulations) are used for prediction after model deployment. To improve model robustness, we first decompose the aggregation error of the central server into bias and variance, and then, propose a robust federated learning framework, named Fed_BVA, that performs on-device adversarial training using the bias-variance oriented adversarial examples supplied by the server via asymmetrical communications. The experiments are conducted on multiple benchmark data sets using several prevalent neural network models, and the empirical results show that our framework is robust against white-box and black-box adversarial corruptions under both IID and non-IID settings. ", + "title": "Adversarially Robust Federated Learning for Neural Networks", + "authors": [ + "Yao Zhou", + "Jun Wu", + "Jingrui He" + ], + "emails": [ + "~Jun_Wu3", + "~Jingrui_He1", + "~Yao_Zhou3" + ], + "rank": 2145 + }, + { + "url": "https://openreview.net/forum?id=6MaBrlQ5JM", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5 + ], + "rating": "4.64", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "A crucial problem in neural networks is to select the most appropriate number of hidden neurons and obtain tight statistical risk bounds. In this work, we present a new perspective towards the bias-variance tradeoff in neural networks. As an alternative to selecting the number of neurons, we theoretically show that L1 regularization can control the generalization error and sparsify the input dimension. In particular, with an appropriate L1 regularization on the output layer, the network can produce a statistical risk that is near minimax optimal. 
Moreover, an appropriate L1 regularization on the input layer leads to a risk bound that does not involve the input data dimension. Our analysis is based on a new amalgamation of dimension-based and norm-based complexity analysis to bound the generalization error. A consequent observation from our results is that an excessively large number of neurons do not necessarily inflate generalization errors under a suitable regularization.\n", + "title": "THE EFFICACY OF L1 REGULARIZATION IN NEURAL NETWORKS", + "authors": [ + "Gen Li", + "Yuantao Gu", + "Jie Ding" + ], + "emails": [ + "~Yuantao_Gu1", + "mails.tsinghua.edu.cn", + "~Jie_Ding2" + ], + "rank": 2146 + }, + { + "url": "https://openreview.net/forum?id=hypDstHla7", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5 + ], + "rating": "4.64", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "Recent experiments indicate that pre-training of end-to-end Reinforcement Learning neural networks on general tasks can speed up the training process for specific robotic applications. However, it remains open if these networks form general feature extractors and a hierarchical organization that are reused as apparent e.g. in Convolutional Neural Networks. In this paper we analyze the intrinsic neuron activation in networks trained for target reaching of robot manipulators with increasing joint number in a vertical plane. We analyze the individual neuron activity distribution in the network, introduce a pruning algorithm to reduce network size keeping the performance, and with these dense network representations we spot correlations of neuron activity patterns among networks trained for robot manipulators with different joint number. We show that the input and output network layers have more distinct neuron activation in contrast to inner layers. Our pruning algorithm reduces the network size significantly, increases the distance of neuron activation while keeping a high performance in training and evaluation. 
Our results demonstrate that neuron activity can be mapped among networks trained for robots with different complexity. Hereby, robots with small joint difference show higher layer-wise projection accuracy whereas more different robots mostly show projections to the first layer.", + "title": "Neuron Activation Analysis for Multi-Joint Robot Reinforcement Learning", + "authors": [ + "Benedikt Feldotto", + "Heiko Lengenfelder", + "Alois Knoll" + ], + "emails": [ + "~Alois_Knoll1", + "~Benedikt_Feldotto1", + "tum.de" + ], + "rank": 2147 + }, + { + "url": "https://openreview.net/forum?id=npOuXc85I5k", + "decision": "Reject", + "ratings": [ + 6, + 3, + 5 + ], + "rating": "4.64", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "Adversarial robustness, mainly including sensitivity-based robustness and spatial robustness, plays an integral part in the robust generalization. In this paper, we endeavor to design strategies to achieve comprehensive adversarial robustness. To hit this target, firstly we investigate the less-studied spatial robustness and then integrate existing spatial robustness methods by incorporating both local and global spatial vulnerability into one spatial attack design. Based on this exploration, we further present a comprehensive relationship between natural accuracy, sensitivity-based and different spatial robustness, supported by the strong evidence from the perspective of representation. More importantly, in order to balance these mutual impact within different robustness into one unified framework, we incorporate the Pareto criterion into the adversarial robustness analysis, yielding a novel strategy towards comprehensive robustness called \\textit{Pareto Adversarial Training}. The resulting Pareto front, the set of optimal solutions, provides the set of optimal balance among natural accuracy and different adversarial robustness, shedding light on solutions towards comprehensive robustness in the future. 
To the best of our knowledge, we are the first to consider comprehensive robustness via the multi-objective optimization.", + "title": "Pareto Adversarial Robustness: Balancing Spatial Robustness and Sensitivity-based Robustness", + "authors": [ + "Ke Sun", + "Mingjie Li", + "Zhouchen Lin" + ], + "emails": [ + "~Zhouchen_Lin1", + "~Mingjie_Li1", + "~Ke_Sun3" + ], + "rank": 2148 + }, + { + "url": "https://openreview.net/forum?id=sI4SVtktqJ2", + "decision": "Reject", + "ratings": [ + 6, + 3, + 6, + 6 + ], + "rating": "4.64", + "confidences": [ + 3, + 5, + 2, + 1 + ], + "abstract": "The randomized smoothing with various noise distributions is a promising approach to protect classifiers from \u2113p adversarial attacks. However, it requires an ensemble of classifiers trained with different noise types and magnitudes, which is computationally expensive. In this work, we present an efficient method for randomized smoothing that does not require any re-training of classifiers. We built upon denoised smoothing, which prepends denoiser to the pre-trained classifier. We investigate two approaches to the image denoising problem for randomized smoothing and show that using the score function suits for both. Moreover, we present an efficient algorithm that can scale to randomized smoothing and can be applied regardless of noise types or levels. 
To validate, we demonstrate the effectiveness of our methods through extensive experiments on CIFAR-10 and ImageNet, under various \u2113p adversaries.", + "title": "Efficient randomized smoothing by denoising with learned score function", + "authors": [ + "Kyungmin Lee", + "Seyoon Oh" + ], + "emails": [ + "~Kyungmin_Lee1", + "add.re.kr" + ], + "rank": 2149 + }, + { + "url": "https://openreview.net/forum?id=Oi-Kh379U0", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 5 + ], + "rating": "4.64", + "confidences": [ + 3, + 3, + 5, + 3 + ], + "abstract": "Recently, a special kind of graph, i.e., supernet, which allows two nodes connected by multi-choice edges, has exhibited its power in neural architecture search (NAS) by searching better architectures for computer vision (CV) and natural language processing (NLP) tasks. In this paper, we discover that the design of such discrete architectures also appears in many other important learning tasks, e.g., logical chain inference in knowledge graphs (KGs) and meta-path discovery in heterogeneous information networks (HINs). Thus, we are motivated to generalize the supernet search problem on a broader horizon. However, none of the existing works are effective since the supernet's topology is highly task-dependent and diverse. To address this issue, we propose to tensorize the supernet, i.e. unify the subgraph search problems by a tensor formulation and encode the topology inside the supernet by a tensor network. We further propose an efficient algorithm that admits both stochastic and deterministic objectives to solve the search problem. Finally, we perform extensive experiments on diverse learning tasks, i.e., architecture design for CV, logic inference for KG, and meta-path discovery for HIN. 
Empirical results demonstrate that our method leads to better performance and architectures.\n", + "title": "Generalizing and Tensorizing Subgraph Search in the Supernet", + "authors": [ + "Hansi Yang", + "quanming yao" + ], + "emails": [ + "~quanming_yao1", + "~Hansi_Yang1" + ], + "rank": 2150 + }, + { + "url": "https://openreview.net/forum?id=BeDgBhZP7S", + "ratings": [ + 5, + 4, + 5 + ], + "rating": "4.64", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "Learning dynamics in deep neural networks are still a subject of debate. In particular, the identification of eventual differences regarding how deep models learn from frequent versus rare training examples is still an area of active research. In this work, we focus on studying the dynamics of memorization in deep neural networks, where we understand memorization as the process of learning from rare or unusual training examples that are part of the long-tail of a dataset. As a working hypothesis, we speculate that during learning some weights focus on mining patterns from frequent examples while others are in charge of memorizing rare long-tail samples. Using this idea, we develop a method for uncovering which weights focus on mining frequent patterns and which ones focus on memorization. Following previous studies, we empirically verify that deep neural networks learn frequent patterns first and then focus on memorizing long-tail examples. Furthermore, our results show that during training a small proportion of the total weights present an early convergence to model frequent patterns, while the vast majority of the weights present a slow convergence to model long-tail examples. We also find that memorization happens mostly at the first layers of a network and not at the level of classification. 
Finally, by analyzing performance differences for models trained with varying levels of long-tail samples, we find that a larger number of long-tail samples has a negative impact on learning frequent patterns, by a process we conjecture to force the model to learn frequent patterns as memorization.", + "title": "Catching the Long Tail in Deep Neural Networks", + "authors": [ + "Julio Hurtado", + "Alain Raymond", + "Alvaro Soto" + ], + "emails": [ + "~Alvaro_Soto1", + "~Alain_Raymond1", + "~Julio_Hurtado1" + ], + "rank": 2151 + }, + { + "url": "https://openreview.net/forum?id=-p6rexF3qdQ", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5, + 5 + ], + "rating": "4.64", + "confidences": [ + 5, + 3, + 3, + 3 + ], + "abstract": "\tIt is now widely known that by adversarial attacks, clean images with invisible perturbations can fool deep neural networks.\n\tTo defend adversarial attacks, we design a block containing multiple paths to learn robust features and the parameters of these paths are required to be orthogonal with each other. \n\tThe so-called Orthogonal Multi-Path (OMP) block could be posed in any layer of a neural network. \n\tVia forward learning and backward correction, one OMP block makes the neural networks learn features that are appropriate for all the paths and hence are expected to be robust. With careful design and thorough experiments on e.g., the positions of imposing orthogonality constraint, and the trade-off between the variety and accuracy, \n\tthe robustness of the neural networks is significantly improved. \n\tFor example, under white-box PGD attack with l\u221e bound 8/255 (this is a fierce attack that can make the accuracy of many vanilla neural networks drop to nearly 10% on CIFAR10), VGG16 with the proposed OMP block could keep over 50% accuracy. For black-box attacks, neural networks equipped with an OMP block have accuracy over 80%. 
The performance under both white-box and black-box attacks is much better than the existing state-of-the-art adversarial defenders. ", + "title": "Learn Robust Features via Orthogonal Multi-Path", + "authors": [ + "Kun Fang", + "Xiaolin Huang", + "Yingwen Wu", + "Tao Li", + "Jie Yang" + ], + "emails": [ + "sjtu.edu.cn", + "~Xiaolin_Huang1", + "~Tao_Li12", + "~Yingwen_Wu1", + "~Kun_Fang1" + ], + "rank": 2152 + }, + { + "url": "https://openreview.net/forum?id=j0yLJ-MsgJ", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5, + 5 + ], + "rating": "4.64", + "confidences": [ + 3, + 5, + 3, + 3 + ], + "abstract": "Few-shot learning aims to train models on a limited number of labeled samples from a support set in order to generalize to unseen samples from a query set. In the standard setup, the support set contains an equal amount of data points for each class. This assumption overlooks many practical considerations arising from the dynamic nature of the real world, such as class imbalance. In this paper, we present a detailed study of few-shot class imbalance along three axes: dataset vs. support set imbalance, effect of different imbalance distributions (linear, step, random), and effect of rebalancing techniques. We extensively compare over 10 state-of-the-art few-shot learning methods using backbones of different depths on multiple datasets. Our analysis reveals that 1) compared to the balanced task, the performances of their class-imbalance counterparts always drop, by up to 18.0% for optimization-based methods, although feature-transfer and metric-based methods generally suffer less, 2) strategies used to mitigate imbalance in supervised learning can be adapted to the few-shot case resulting in better performances, 3) the effects of imbalance at the dataset level are less significant than the effects at the support set level. 
The code to reproduce the experiments is released under an open-source license.", + "title": "Class Imbalance in Few-Shot Learning", + "authors": [ + "Mateusz Ochal", + "Massimiliano Patacchiola", + "Jose Vazquez", + "Amos Storkey", + "Sen Wang" + ], + "emails": [ + "~Amos_Storkey1", + "~Sen_Wang7", + "~Massimiliano_Patacchiola1", + "~Mateusz_Ochal1", + "seebyte.com" + ], + "rank": 2153 + }, + { + "url": "https://openreview.net/forum?id=GboAslXYiBr", + "ratings": [ + 5, + 5, + 4 + ], + "rating": "4.64", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "Spoken language understanding (SLU) requires a model to analyze input acoustic signals to understand its linguistic content and make predictions. To boost the models' performance, various pre-training methods have been proposed to utilize large-scale unlabeled text and speech data. However, the inherent disparities between the two modalities necessitate a mutual analysis. In this paper, we propose a novel semi-supervised learning method, AlignNet, to jointly pre-train the speech and language modules. Besides a self-supervised masked language modeling of the two individual modules, AlignNet aligns representations from paired speech and transcripts in a shared latent semantic space. Thus, during fine-tuning, the speech module alone can produce representations carrying both acoustic information and contextual semantic knowledge. Experimental results verify the effectiveness of our approach on various SLU tasks. 
For example, AlignNet improves the previous state-of-the-art accuracy on the Spoken SQuAD dataset by 6.2%.", + "title": "Semi-Supervised Speech-Language Joint Pre-Training for Spoken Language Understanding", + "authors": [ + "Yu-An Chung", + "Chenguang Zhu", + "Michael Zeng" + ], + "emails": [ + "~Yu-An_Chung1", + "~Chenguang_Zhu1", + "~Michael_Zeng1" + ], + "rank": 2154 + }, + { + "url": "https://openreview.net/forum?id=NUCZeoVlAe", + "decision": "Reject", + "ratings": [ + 6, + 5, + 3 + ], + "rating": "4.64", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "While deep learning is effective to learn features/representations from data, the distributions of samples in feature spaces learned by various architectures for different training tasks (e.g., latent layers of AEs and feature vectors in CNN classifiers) have not been well-studied or compared. We hypothesize that the feature spaces of networks trained by various architectures (AEs or CNNs) and tasks (supervised, unsupervised, or self-supervised learning) share some common subspaces, no matter what types of DNN architectures or whether the labels have been used in feature learning. To test our hypothesis, through Singular Value Decomposition (SVD) of feature vectors, we demonstrate that one could linearly project the feature vectors of the same group of samples to a similar distribution, where the distribution is represented as the top left singular vector (i.e., principal subspace of feature vectors), namely P-vectors. We further assess the convergence of feature space learning using angles between P-vectors obtained from the well-trained model and its checkpoint per epoch during the learning procedure, where a quasi-monotonic trend of convergence to small angles has been observed. Finally, we carry out case studies to connect P-vectors to the data distribution, and generalization performance. 
Extensive experiments with practically-used MLP, AE and CNN architectures for classification, image reconstruction, and self-supervised learning tasks on MNIST, CIFAR-10 and CIFAR-100 datasets have been done to support our claims with solid evidences.", + "title": "Empirical Studies on the Convergence of Feature Spaces in Deep Learning", + "authors": [ + "Haoran Liu", + "Haoyi Xiong", + "Yaqing Wang", + "Haozhe An", + "Dongrui Wu", + "Dejing Dou" + ], + "emails": [ + "~Haoran_Liu2", + "~Dongrui_Wu1", + "~Haozhe_An1", + "~Haoyi_Xiong1", + "~Dejing_Dou1", + "~Yaqing_Wang2" + ], + "rank": 2155 + }, + { + "url": "https://openreview.net/forum?id=mB_2tZ9d7L3", + "ratings": [ + 6, + 4, + 3, + 6 + ], + "rating": "4.64", + "confidences": [ + 4, + 5, + 3, + 2 + ], + "abstract": "Convolutional Neural Networks (CNNs) have achieved tremendous success in a number of learning tasks, e.g., image classification. Recent advances in CNNs, such as ResNets and DenseNets, mainly focus on the skip and concatenation operators to avoid gradient vanishing. However, such operators largely neglect information across layers (as in ResNets) or involve tremendous redundancy of features repeatedly copied from previous layers (as in DenseNets). In this paper, we design Attentive Feature Integration (AFI) modules, which can be applicable to most recent network architectures, leading to new architectures named as AFINets. AFINets can by adaptively integrate distinct information through explicitly modeling the subordinate relationship between different levels of features. Experimental results on benchmark datasets have demonstrated the effectiveness of the proposed AFI modules. 
\n", + "title": "AFINets: Attentive Feature Integration Networks for Image Classification", + "authors": [ + "Xinglin Pan", + "Jing Xu", + "Yu Pan", + "WenXiang Lin", + "Liangjian Wen", + "Zenglin Xu" + ], + "emails": [ + "~Zenglin_Xu1", + "~WenXiang_Lin1", + "~Yu_Pan1", + "~Liangjian_Wen1", + "~Xinglin_Pan1", + "~Jing_Xu1" + ], + "rank": 2156 + }, + { + "url": "https://openreview.net/forum?id=VlRqY4sV9FO", + "decision": "Reject", + "ratings": [ + 5, + 3, + 7, + 4 + ], + "rating": "4.64", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "The importance of explainability in machine learning continues to grow, as both neural-network architectures and the data they model become increasingly complex. Unique challenges arise when a model's input features become high-dimensional: on one hand, principled model-agnostic approaches to explainability become too computationally expensive; on the other, more efficient explainability algorithms lack natural interpretations for general users. In this work, we introduce a framework for human-interpretable explainability on high-dimensional data, consisting of two modules. First, we apply a semantically-meaningful latent representation, both to reduce the raw dimensionality of the data, and to ensure its human interpretability. These latent features can be learnt, e.g. explicitly as disentangled representations or implicitly through image-to-image translation, or they can be based on any computable quantities the user chooses. Second, we adapt the Shapley paradigm for model-agnostic explainability to operate on these latent features. This leads to interpretable model explanations that are both theoretically controlled and computationally tractable. 
We benchmark our approach on synthetic data and demonstrate its effectiveness on several image-classification tasks.", + "title": "Human-interpretable model explainability on high-dimensional data", + "authors": [ + "Damien de Mijolla", + "Christopher Frye", + "Markus Kunesch", + "John Mansir", + "Ilya Feige" + ], + "emails": [ + "~Christopher_Frye1", + "faculty.ai", + "gmail.com", + "~Ilya_Feige1", + "cantab.net" + ], + "rank": 2157 + }, + { + "url": "https://openreview.net/forum?id=pQ-AoEbNYQK", + "decision": "Reject", + "ratings": [ + 6, + 4, + 4, + 5 + ], + "rating": "4.64", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "The automated machine learning (AutoML) pipeline comprises several crucial components such as automated data augmentation (DA), neural architecture search (NAS) and hyper-parameter optimization (HPO). Although many strategies have been developed for automating each component in separation, joint optimization of these components remains challenging due to the largely increased search dimension and different input types required for each component. While conducting these components in sequence is usually adopted as a workaround, it often requires careful coordination by human experts and may lead to sub-optimal results. In parallel to this, the common practice of searching for the optimal architecture first and then retraining it before deployment in NAS often suffers from architecture performance difference in the search and retraining stages. An end-to-end solution that integrates the two stages and returns a ready-to-use model at the end of the search is desirable. In view of these, we propose a differentiable joint optimization solution for efficient end-to-end AutoML (DiffAutoML). Our method performs co-optimization of the neural architectures, training hyper-parameters and data augmentation policies in an end-to-end fashion without the need of model retraining. 
Experiments show that DiffAutoML achieves state-of-the-art results on ImageNet compared with end-to-end AutoML algorithms, and achieves superior performance compared with multi-stage AutoML algorithms with higher computational efficiency.\nTo the best of our knowledge, we are the first to jointly optimize automated DA, NAS and HPO in an en-to-end manner without retraining.", + "title": "DiffAutoML: Differentiable Joint Optimization for Efficient End-to-End Automated Machine Learning", + "authors": [ + "Kaichen Zhou", + "Lanqing HONG", + "Fengwei Zhou", + "Binxin Ru", + "Zhenguo Li", + "Trigoni Niki", + "Jiashi Feng" + ], + "emails": [ + "~Jiashi_Feng1", + "~Fengwei_Zhou1", + "~Lanqing_HONG1", + "~Binxin_Ru1", + "cs.ox.ac.uk", + "~Kaichen_Zhou1", + "~Zhenguo_Li1" + ], + "rank": 2158 + }, + { + "url": "https://openreview.net/forum?id=3zaVN0M0BIb", + "decision": "Reject", + "ratings": [ + 6, + 4, + 4, + 5 + ], + "rating": "4.64", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "In supervised learning, it is known that overparameterized neural networks with one hidden layer provably and efficiently learn and generalize, when trained using Stochastic Gradient Descent (SGD). In contrast, the benefit of overparameterization in unsupervised learning is not well understood. Normalizing flows (NFs) learn to map complex real-world distributions into simple base distributions, and constitute an important class of models in unsupervised learning for sampling and density estimation. In this paper, we theoretically and empirically analyze these models when the underlying neural network is one hidden layer overparametrized network. On the one hand we provide evidence that for a class of NFs, overparametrization hurts training. On the other, we prove that another class of NFs, with similar underlying networks can efficiently learn any reasonable data distribution under minimal assumptions. 
We extend theoretical ideas on learning and generalization from overparameterized neural networks in supervised learning to overparameterized normalizing flows in unsupervised learning. We also provide experimental validation to support our theoretical analysis in practice.", + "title": "Learning and Generalization in Univariate Overparameterized Normalizing Flows", + "authors": [ + "Kulin Shah", + "Amit Deshpande", + "Navin Goyal" + ], + "emails": [ + "~Amit_Deshpande1", + "~Navin_Goyal1", + "~Kulin_Shah1" + ], + "rank": 2159 + }, + { + "url": "https://openreview.net/forum?id=zOGdf9K8aC", + "decision": "Reject", + "ratings": [ + 6, + 5, + 4, + 4 + ], + "rating": "4.64", + "confidences": [ + 4, + 1, + 5, + 4 + ], + "abstract": "Density estimation, compression, and data generation are crucial tasks in artificial intelligence. Variational Auto-Encoders (VAEs) constitute a single framework to achieve these goals. Here, we present a novel class of generative models, called self-supervised Variational Auto-Encoder (selfVAE), that utilizes deterministic and discrete transformations of data. This class of models allows performing both conditional and unconditional sampling while simplifying the objective function. First, we use a single self-supervised transformation as a latent variable, where a transformation is either downscaling or edge detection. Next, we consider a hierarchical architecture, i.e., multiple transformations, and we show its benefits compared to the VAE. The flexibility of selfVAE in data reconstruction finds a particularly interesting use case in data compression tasks, where we can trade-off memory for better data quality, and vice-versa. 
We present the performance of our approach on three benchmark image data (Cifar10, Imagenette64, and CelebA).", + "title": "Self-Supervised Variational Auto-Encoders", + "authors": [ + "Ioannis Gatopoulos", + "Jakub Mikolaj Tomczak" + ], + "emails": [ + "gmail.com", + "~Jakub_Mikolaj_Tomczak1" + ], + "rank": 2160 + }, + { + "url": "https://openreview.net/forum?id=5Spjp0zDYt", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 4 + ], + "rating": "4.64", + "confidences": [ + 3, + 3, + 3, + 5 + ], + "abstract": "Variational Auto-encoders (VAEs) are deep generative latent variable models that are widely used for a number of downstream tasks. While it has been demonstrated that VAE training can suffer from a number of pathologies, existing literature lacks characterizations of exactly when these pathologies occur and how they impact down-stream task performance. In this paper we concretely characterize conditions under which VAE training exhibits pathologies and connect these failure modes to undesirable effects on specific downstream tasks, such as learning compressed and disentangled representations, adversarial robustness and semi-supervised learning.", + "title": "Failure Modes of Variational Autoencoders and Their Effects on Downstream Tasks", + "authors": [ + "Yaniv Yacoby", + "Weiwei Pan", + "Finale Doshi-Velez" + ], + "emails": [ + "~Finale_Doshi-Velez1", + "~Weiwei_Pan1", + "~Yaniv_Yacoby1" + ], + "rank": 2161 + }, + { + "url": "https://openreview.net/forum?id=2oci5kFXE0o", + "ratings": [ + 5, + 4, + 5 + ], + "rating": "4.64", + "confidences": [ + 5, + 5, + 4 + ], + "abstract": "In this work, we focus on an analogical reasoning task that contains rich compositional structures, Raven's Progressive Matrices (RPM). To discover compositional structures of the data, we propose the Scattering Compositional Learner (SCL), an architecture that composes neural networks in a sequence. 
Our SCL achieves state-of-the-art performance on two RPM datasets, with a 48.7% relative improvement on Balanced-RAVEN and 26.4% on PGM over the previous state-of-the-art. We additionally show that our model discovers compositional representations of objects' attributes (e.g., shape color, size), and their relationships (e.g., progression, union). We also find that the compositional representation makes the SCL significantly more robust to test-time domain shifts and greatly improves zero-shot generalization to previously unseen analogies.", + "title": "The Scattering Compositional Learner: Discovering Objects, Attributes, Relationships in Analogical Reasoning", + "authors": [ + "Yuhuai Wu", + "Honghua Dong", + "Roger Baker Grosse", + "Jimmy Ba" + ], + "emails": [ + "~Jimmy_Ba1", + "~Honghua_Dong1", + "~Roger_Baker_Grosse1", + "~Yuhuai_Wu1" + ], + "rank": 2162 + }, + { + "url": "https://openreview.net/forum?id=ODKwX19UjOj", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4, + 4 + ], + "rating": "4.64", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "Discovering concepts (or temporal abstractions) in an unsupervised manner from demonstration data in the absence of an environment is an important problem. Organizing these discovered concepts hierarchically at different levels of abstraction is useful in discovering patterns, building ontologies, and generating tutorials from demonstration data. However, recent work to discover such concepts without access to any environment does not discover relationships (or a hierarchy) between these discovered concepts. In this paper, we present a Transformer-based concept abstraction architecture UNHCLE (pronounced uncle) that extracts a hierarchy of concepts in an unsupervised way from demonstration data. We empirically demonstrate how UNHCLE discovers meaningful hierarchies using datasets from Chess and Cooking domains. 
Finally, we show how UNHCLE learns meaningful language labels for concepts by using demonstration data augmented with natural language for cooking and chess. All of our code is available at https://github.com/UNHCLE/UNHCLE\n", + "title": "Unsupervised Hierarchical Concept Learning", + "authors": [ + "Sumegh Roychowdhury", + "Sumedh Anand Sontakke", + "Mausoom Sarkar", + "Nikaash Puri", + "Milan Aggarwal", + "Pinkesh Badjatiya", + "Balaji Krishnamurthy", + "Laurent Itti" + ], + "emails": [ + "~Pinkesh_Badjatiya1", + "~Sumegh_Roychowdhury1", + "~Mausoom_Sarkar1", + "~Milan_Aggarwal1", + "~Balaji_Krishnamurthy1", + "~Nikaash_Puri1", + "~Sumedh_Anand_Sontakke1", + "~Laurent_Itti1" + ], + "rank": 2163 + }, + { + "url": "https://openreview.net/forum?id=7MjfPd-Irao", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 7 + ], + "rating": "4.64", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Procedurally-generated sparse reward environments pose significant challenges for many RL algorithms. The recently proposed impact-driven exploration method (RIDE) by Raileanu & Rockt\u00e4schel (2020), which rewards actions that lead to large changes (measured by \u21132-distance) in the observation embedding, achieves state-of-the-art performance on such procedurally-generated MiniGrid tasks. Yet, the definition of \"impact\" in RIDE is not conceptually clear because its learned embedding space is not inherently equipped with any similarity measure, let alone \u21132-distance. We resolve this issue in RIDE via contrastive learning. That is, we train the embedding with respect to cosine similarity, where we define two observations to be similar if the agent can reach one observation from the other within a few steps, and define impact in terms of this similarity measure. Experimental results show that our method performs similarly to RIDE on the MiniGrid benchmarks while learning a conceptually clear embedding space equipped with the cosine similarity measure. 
Our modification of RIDE also provides a new perspective which connects RIDE and episodic curiosity (Savinov et al., 2019), a different exploration method which rewards the agent for visiting states that are unfamiliar to the agent's episodic memory. By incorporating episodic memory into our method, we outperform RIDE on the MiniGrid benchmarks.", + "title": "Impact-driven Exploration with Contrastive Unsupervised Representations", + "authors": [ + "Min Jae Song", + "Dan Kushnir" + ], + "emails": [ + "~Dan_Kushnir1", + "~Min_Jae_Song1" + ], + "rank": 2164 + }, + { + "url": "https://openreview.net/forum?id=0qbEq5UBfGD", + "decision": "Reject", + "ratings": [ + 4, + 5, + 6, + 4 + ], + "rating": "4.62", + "confidences": [ + 5, + 2, + 3, + 3 + ], + "abstract": "Time series data is abundantly available in the real world, but there is a distinct lack of large, labeled datasets available for many types of learning tasks. Semi-supervised models, which can leverage small amounts of expert-labeled data along with a larger unlabeled dataset, have been shown to improve performance over unsupervised learning models. Existing semi-supervised time series clustering algorithms suffer from lack of scalability as they are limited to perform learning operations within the original data space. We propose an autoencoder-based semi-supervised learning model along with multiple semi-supervised objective functions which can be used to improve the quality of the autoencoder\u2019s learned latent space via the addition of a small number of labeled examples. Experiments on a variety of datasets show that our methods can usually improve k-Means clustering performance. Our methods achieve a maximum average ARI of 0.897, a 140% increase over an unsupervised CAE model. 
Our methods also achieve a maximum improvement of 44% over a semi-supervised model.\n", + "title": "Latent Space Semi-Supervised Time Series Data Clustering", + "authors": [ + "Andrew Hill", + "Katerina Kechris", + "Russell Bowler", + "Farnoush Kashani" + ], + "emails": [ + "~Andrew_Hill2", + "njhealth.org", + "~Farnoush_Kashani1", + "cuanschutz.edu" + ], + "rank": 2165 + }, + { + "url": "https://openreview.net/forum?id=qU-eouoIyAy", + "decision": "Reject", + "ratings": [ + 2, + 5, + 7, + 5, + 4 + ], + "rating": "4.62", + "confidences": [ + 5, + 4, + 5, + 4, + 3 + ], + "abstract": "We introduce a new framework for hyperrealistic reconstruction of perceived naturalistic stimuli from brain recordings. To this end, we embrace the use of generative adversarial networks (GANs) at the earliest step of our neural decoding pipeline by acquiring functional magnetic resonance imaging data as subjects perceived face images created by the generator network of a GAN. Subsequently, we used a decoding approach to predict the latent state of the GAN from brain data. Hence, latent representations for stimulus (re-)generation are obtained, leading to state-of-the-art image reconstructions. Altogether, we have developed a highly promising approach for decoding sensory perception from brain activity and systematically analyzing neural information processing in the human brain.", + "title": "Hyperrealistic neural decoding: Reconstruction of face stimuli from fMRI measurements via the GAN latent space", + "authors": [ + "Thirza Dado", + "Ya\u011fmur G\u00fc\u00e7l\u00fct\u00fcrk", + "Luca Ambrogioni", + "Gabrielle Ras", + "Sander E. 
Bosch", + "Marcel van Gerven", + "Umut G\u00fc\u00e7l\u00fc" + ], + "emails": [ + "~Ya\u011fmur_G\u00fc\u00e7l\u00fct\u00fcrk1", + "donders.ru.nl", + "~Thirza_Dado1", + "~Umut_G\u00fc\u00e7l\u00fc1", + "~Luca_Ambrogioni1", + "~Marcel_van_Gerven1", + "~Gabrielle_Ras1" + ], + "rank": 2166 + }, + { + "url": "https://openreview.net/forum?id=Be7Z5EfYp-Q", + "ratings": [ + 3, + 4, + 7, + 5 + ], + "rating": "4.62", + "confidences": [ + 5, + 1, + 3, + 4 + ], + "abstract": "Phase retrieval is the inverse problem of recovering a signal from magnitude-only Fourier measurements, and underlies numerous imaging modalities, such as Coherent Diffraction Imaging (CDI). A variant of this setup, known as holography, includes a reference object that is placed adjacent to the specimen of interest before measurements are collected. The resulting inverse problem, known as holographic phase retrieval, is well-known to have improved problem conditioning relative to the original. This innovation, i.e. Holographic CDI, becomes crucial at the nanoscale, where imaging specimens such as viruses, proteins, and crystals require low-photon measurements. This data is highly corrupted by Poisson shot noise, and often lacks low-frequency content as well. In this work, we introduce a dataset-free deep learning framework for holographic phase retrieval adapted to these challenges. The key ingredients of our approach are the explicit and flexible incorporation of the physical forward model into the automatic differentiation procedure, the Poisson log-likelihood objective function, and an optional untrained deep image prior. We perform extensive evaluation under realistic conditions. Compared to competing classical methods, our method recovers signal from higher noise levels and is more resilient to suboptimal reference design, as well as to large missing regions of low-frequencies in the observations. 
To the best of our knowledge, this is the first work to consider a dataset-free machine learning approach for holographic phase retrieval.", + "title": "Practical Phase Retrieval: Low-Photon Holography with Untrained Priors", + "authors": [ + "Hannah Lawrence", + "David Barmherzig", + "Henry Li", + "Michael Eickenberg", + "Marylou Gabri\u00e9" + ], + "emails": [ + "~Marylou_Gabri\u00e91", + "~Henry_Li2", + "mit.edu", + "flatironinstitute.org", + "~Michael_Eickenberg3" + ], + "rank": 2167 + }, + { + "url": "https://openreview.net/forum?id=QJc4HWzF7FW", + "ratings": [ + 4, + 4, + 6, + 4 + ], + "rating": "4.62", + "confidences": [ + 3, + 3, + 4, + 3 + ], + "abstract": "Meta continual learning algorithms seek to train a model when faced with similar tasks observed in a sequential manner. Despite promising methodological advancements, there is a lack of theoretical frameworks that enable analysis of learning challenges such as generalization and catastrophic forgetting. To that end, we develop a new theoretical approach for meta continual learning~(MCL) where we mathematically model the learning dynamics using dynamic programming, and we establish conditions of optimality for the MCL problem. Moreover, using the theoretical framework, we derive a new dynamic-programming-based MCL method that adopts stochastic-gradient-driven alternating optimization to balance generalization and catastrophic forgetting. 
We show that, on MCL benchmark data sets, our theoretically grounded method achieves accuracy better than or comparable to that of existing state-of-the-art methods.", + "title": "Meta-Continual Learning Via Dynamic Programming", + "authors": [ + "Krishnan Raghavan", + "Prasanna Balaprakash" + ], + "emails": [ + "~Krishnan_Raghavan1", + "~Prasanna_Balaprakash1" + ], + "rank": 2168 + }, + { + "url": "https://openreview.net/forum?id=TwkEGci1Y-", + "decision": "Reject", + "ratings": [ + 7, + 4, + 5, + 3 + ], + "rating": "4.62", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "Few-shot learning aims to classify unknown classes of examples with a few new examples per class. There are two key routes for few-shot learning. One is to (pre-)train a classifier with examples from known classes, and then transfer the pre-trained classifier to unknown classes using the new examples. The other, called meta few-shot learning, is to couple pre-training with episodic training, which contains episodes of few-shot learning tasks simulated from the known classes. Pre-training is known to play a crucial role for the transfer route, but the role of pre-training for the episodic route is less clear. In this work, we study the role of pre-training for the episodic route. We find that pre-training serves a major role of disentangling representations of known classes, which makes the resulting learning tasks easier for episodic training. The finding allows us to shift the huge simulation burden of episodic learning to a simpler pre-training stage. We justify such a benefit of shift by designing a new disentanglement-based pre-training model, which helps episodic learning achieve competitive performance more efficiently. 
", + "title": "On the Role of Pre-training for Meta Few-Shot Learning", + "authors": [ + "Chia-You Chen", + "Hsuan-Tien Lin", + "Gang Niu", + "Masashi Sugiyama" + ], + "emails": [ + "~Chia-You_Chen1", + "~Hsuan-Tien_Lin1", + "~Gang_Niu1", + "~Masashi_Sugiyama1" + ], + "rank": 2169 + }, + { + "url": "https://openreview.net/forum?id=q_Q9MMGwSQu", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5, + 4 + ], + "rating": "4.62", + "confidences": [ + 3, + 5, + 4, + 4 + ], + "abstract": "Refraining from confidently predicting when faced with categories of inputs different from those seen during training is an important requirement for the safe deployment of deep learning systems. While simple to state, this has been a particularly challenging problem in deep learning, where models often end up making overconfident predictions in such situations. In this work we present a simple, but highly effective approach to deal with out-of-distribution detection that uses the principle of abstention: when encountering a sample from an unseen class, the desired behavior is to abstain from predicting. Our approach uses a network with an extra abstention class and is trained on a dataset that is augmented with an uncurated set that consists of a large number of out-of-distribution (OoD) samples that are assigned the label of the abstention class; the model is then trained to learn an effective discriminator between in and out-of-distribution samples. \n\n We compare this relatively simple approach against a wide variety of more complex methods that have been proposed both for out-of-distribution detection as well as uncertainty modeling in deep learning, and empirically demonstrate its effectiveness on a wide variety of of benchmarks and deep architectures for image recognition and text classification, often outperforming existing approaches by significant margins. 
Given the simplicity and effectiveness of this method, we propose that this approach be used as a new additional baseline for future work in this domain.", + "title": "A Simple and Effective Baseline for Out-of-Distribution Detection using Abstention", + "authors": [ + "Sunil Thulasidasan", + "Sushil Thapa", + "Sayera Dhaubhadel", + "Gopinath Chennupati", + "Tanmoy Bhattacharya", + "Jeff Bilmes" + ], + "emails": [ + "~Gopinath_Chennupati1", + "~Sushil_Thapa2", + "~Jeff_Bilmes1", + "lanl.gov", + "~Sunil_Thulasidasan1", + "~Tanmoy_Bhattacharya1" + ], + "rank": 2170 + }, + { + "url": "https://openreview.net/forum?id=pW--cu2FCHY", + "decision": "Reject", + "ratings": [ + 4, + 6, + 5, + 4 + ], + "rating": "4.62", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "We introduce Attention Free Transformer (AFT), an efficient variant of Transformers \\citep{transformer} that eliminates the need for dot product attention. AFT offers great simplicity and efficiency compared with standard Transformers, where the multi-head attention operation is replaced with the composition of element-wise multiplications/divisions and global/local pooling. During training time, AFT has linear time and space complexity w.r.t. both the sequence length and feature dimension; in the autoregressive decoding mode, AFT has constant memory and time complexity per step. We show that, surprisingly, we are able to train AFT effectively on challenging benchmarks, and also to match or surpass the standard Transformer counterparts and other efficient variants. In particular, AFT achieves the state-of-the-art result on CIFAR10 autoregressive modeling with much reduced complexity, and also outperforms several efficient Transformer variants on Enwik8.", + "title": "An Attention Free Transformer", + "authors": [ + "Shuangfei Zhai", + "Walter Talbott", + "Nitish Srivastava", + "Chen Huang", + "Hanlin Goh", + "Joshua M. 
Susskind" + ], + "emails": [ + "~Chen_Huang6", + "~Walter_Talbott1", + "~Shuangfei_Zhai3", + "~Joshua_M._Susskind1", + "~Hanlin_Goh2", + "~Nitish_Srivastava1" + ], + "rank": 2171 + }, + { + "url": "https://openreview.net/forum?id=sFDJNhwz7S", + "decision": "Reject", + "ratings": [ + 4, + 6, + 4 + ], + "rating": "4.62", + "confidences": [ + 4, + 4, + 5 + ], + "abstract": "Semantic hashing methods have been explored for learning transformations into binary vector spaces. These learned binary representations may then be used in hashing based retrieval methods, typically by retrieving all neighboring elements in the Hamming ball with radius 1 or 2. Prior studies focus on tasks with a few dozen to a few hundred semantic categories at most, and it is not currently well known how these methods scale to domains with richer semantic structure. In this study, we focus on learning embeddings for the use in exact hashing retrieval, where Approximate Nearest Neighbor search comprises of a simple table lookup. We propose similarity learning methods in which the optimized similarity is the angular similarity (the probability of collision under SimHash.) We demonstrate the benefits of these embeddings on a variety of domains, including a coocurrence modelling task on a large scale text corpus; a rich structure of which cannot be handled by a few hundred semantic groups.", + "title": "Semantic Hashing with Locality Sensitive Embeddings", + "authors": [ + "Levi Boyles", + "Aniket Anand Deshmukh", + "Urun Dogan", + "Rajesh Koduru", + "Charles Denis", + "Eren Manavoglu" + ], + "emails": [ + "~Urun_Dogan1", + "microsoft.com", + "~Eren_Manavoglu1", + "~Aniket_Anand_Deshmukh1", + "~Levi_Boyles1" + ], + "rank": 2172 + }, + { + "url": "https://openreview.net/forum?id=IpPQmzj4T_", + "decision": "Reject", + "ratings": [ + 5, + 3, + 5, + 5 + ], + "rating": "4.62", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "We consider the limitations in message-passing graph neural networks. 
In message-passing operations, each node aggregates information from its neighboring nodes. To enlarge the receptive field, graph neural networks need to stack multiple message-passing graph convolution layers, which leads to the over-fitting issue and over-smoothing issue. To address these limitations, we propose a teleport graph convolution layer (TeleGCL) that uses teleport functions to enable each node to aggregate information from a much larger neighborhood. For each node, teleport functions select relevant nodes beyond the local neighborhood, thereby resulting in a larger receptive field. To apply our structure-aware teleport function, we propose a novel method to construct structural features for nodes in the graph. Based on our TeleGCL, we build a family of teleport graph convolutional networks. The empirical results on graph and node classification tasks demonstrate the effectiveness of our proposed methods.", + "title": "Teleport Graph Convolutional Networks", + "authors": [ + "Hongyang Gao", + "Shuiwang Ji" + ], + "emails": [ + "~Hongyang_Gao1", + "~Shuiwang_Ji1" + ], + "rank": 2173 + }, + { + "url": "https://openreview.net/forum?id=GEpTemgn7cq", + "decision": "Reject", + "ratings": [ + 4, + 5, + 6, + 4 + ], + "rating": "4.62", + "confidences": [ + 5, + 4, + 3, + 4 + ], + "abstract": "Promising results have driven a recent surge of interest in continuous optimization methods for Bayesian network structure learning from observational data. However, there are theoretical limitations on the identifiability of underlying structures obtained from observational data alone. Interventional data provides much richer information about the underlying data-generating process. However, the extension and application of methods designed for observational data to include interventions is not straightforward and remains an open problem. 
In this paper we provide a general framework based on continuous optimization and neural networks to create models for the combination of observational and interventional data. The proposed method is applicable even in the challenging and realistic case that the identity of the intervened upon variable is unknown. We examine the proposed method in the setting of graph recovery both de novo and from a partially-known edge set. We establish strong benchmark results on several structure learning tasks, including structure recovery of both synthetic graphs as well as standard graphs from the Bayesian Network Repository.", + "title": "Dependency Structure Discovery from Interventions", + "authors": [ + "Nan Rosemary Ke", + "Olexa Bilaniuk", + "Anirudh Goyal", + "Stefan Bauer", + "Bernhard Sch\u00f6lkopf", + "Michael Curtis Mozer", + "Hugo Larochelle", + "Christopher Pal", + "Yoshua Bengio" + ], + "emails": [ + "~Olexa_Bilaniuk1", + "~Bernhard_Sch\u00f6lkopf1", + "~Stefan_Bauer1", + "~Yoshua_Bengio1", + "~Hugo_Larochelle1", + "~Christopher_Pal1", + "~Michael_Curtis_Mozer1", + "~Anirudh_Goyal1", + "~Nan_Rosemary_Ke1" + ], + "rank": 2174 + }, + { + "url": "https://openreview.net/forum?id=f0sNwNeqqxx", + "decision": "Reject", + "ratings": [ + 5, + 3, + 6, + 5 + ], + "rating": "4.62", + "confidences": [ + 4, + 4, + 3, + 2 + ], + "abstract": "Federated learning (FL) is a technique that trains machine learning models from decentralized data sources. We study FL under local differential privacy constraints, which provides strong protection against sensitive data disclosures via obfuscating the data before leaving the client. We identify two major concerns in designing practical privacy-preserving FL algorithms: communication efficiency and high-dimensional compatibility. We then develop a gradient-based learning algorithm called \\emph{sqSGD} (selective quantized stochastic gradient descent) that addresses both concerns. 
The proposed algorithm is based on a novel privacy-preserving quantization scheme that uses a constant number of bits per dimension per client. Then we improve the base algorithm in two ways: first, we apply a gradient subsampling strategy that offers simultaneously better training performance and smaller communication costs under a fixed privacy budget. Secondly, we utilize randomized rotation as a preprocessing step to reduce quantization error. We also initialize a discussion about the role of quantization and perturbation in FL algorithm design with privacy and communication constraints. Finally, the practicality of the proposed framework is demonstrated on benchmark datasets. Experiment results show that sqSGD successfully learns large models like LeNet and ResNet with local privacy constraints. In addition, with fixed privacy and communication level, the performance of sqSGD significantly dominates that of baseline algorithms.", + "title": "Practical Locally Private Federated Learning with Communication Efficiency", + "authors": [ + "Yan Feng", + "Tao Xiong", + "Ruofan Wu", + "Yuan Qi" + ], + "emails": [ + "~Yan_Feng2", + "~Ruofan_Wu1", + "antgroup.com" + ], + "rank": 2175 + }, + { + "url": "https://openreview.net/forum?id=6HlaJSlQFEj", + "decision": "Reject", + "ratings": [ + 7, + 4, + 6, + 3 + ], + "rating": "4.62", + "confidences": [ + 3, + 5, + 3, + 5 + ], + "abstract": "While deep neural networks show unprecedented performance in various tasks, the vulnerability to adversarial examples hinders their deployment in safety-critical systems. Many studies have shown that attacks are also possible even in a black-box setting where an adversary cannot access the target model's internal information. Most black-box attacks are based on queries, each of which obtains the target model's output for an input, and many recent studies focus on reducing the number of required queries. 
In this paper, we pay attention to an implicit assumption of these attacks that the target model's output exactly corresponds to the query input. If some randomness is introduced into the model to break this assumption, query-based attacks may have tremendous difficulty in both gradient estimation and local search, which are the core of their attack process. From this motivation, we observe even a small additive input noise can neutralize most query-based attacks and name this simple yet effective approach Small Noise Defense (SND). We analyze how SND can defend against query-based black-box attacks and demonstrate its effectiveness against eight different state-of-the-art attacks with CIFAR-10 and ImageNet datasets. Even with strong defense ability, SND almost maintains the original clean accuracy and computational speed. SND is readily applicable to pre-trained models by adding only one line of code at the inference stage, so we hope that it will be used as a baseline of defense against query-based black-box attacks in the future.", + "title": "Small Input Noise is Enough to Defend Against Query-based Black-box Attacks", + "authors": [ + "Junyoung Byun", + "Hyojun Go", + "Changick Kim" + ], + "emails": [ + "~Junyoung_Byun2", + "~Changick_Kim1", + "kaist.ac.kr" + ], + "rank": 2176 + }, + { + "url": "https://openreview.net/forum?id=VKoY98_IN3-", + "ratings": [ + 5, + 4, + 5 + ], + "rating": "4.62", + "confidences": [ + 4, + 5, + 4 + ], + "abstract": "We present a novel approach for image-animation of a source image by a driving video, both depicting the same type of object. We do not assume the existence of pose models and our method is able to animate arbitrary objects without knowledge of the object's structure. Furthermore, both the driving video and the source image are only seen during test-time. Our method is based on a shared mask generator, which separates the foreground object from its background, and captures the object\u2019s general pose and shape. 
A mask-refinement module then replaces, in the mask extracted from the driver image, the identity of the driver with the identity of the source. Conditioned on the source image, the transformed mask is then decoded by a multi-scale generator that renders a realistic image, in which the content of the source frame is animated by the pose in the driving video. Our method is shown to greatly outperform the state of the art methods on multiple benchmarks. Our code and samples are attached as supplementary.", + "title": "Image Animation with Refined Masking", + "authors": [ + "yoav shalev", + "Lior Wolf" + ], + "emails": [ + "~Lior_Wolf1", + "~yoav_shalev1" + ], + "rank": 2177 + }, + { + "url": "https://openreview.net/forum?id=QcqsxI6rKDs", + "decision": "Reject", + "ratings": [ + 4, + 5, + 6, + 4 + ], + "rating": "4.62", + "confidences": [ + 5, + 4, + 3, + 4 + ], + "abstract": "Meta-optimization is an effective approach that learns a shared set of parameters across tasks for parameter initialization in meta-learning.\nA key challenge for meta-optimization based approaches is to determine whether an initialization condition can be generalized to tasks with diverse distributions to accelerate learning. \nTo address this issue, we design a meta-gradient boosting framework that uses a base learner to learn shared information across tasks and a series of gradient-boosted modules to capture task-specific information to fit diverse distributions.\nWe evaluate the proposed model on both regression and classification tasks with multi-mode distributions. 
\nThe results demonstrate both the effectiveness of our model in modulating task-specific meta-learned priors and its advantages on multi-mode distributions.", + "title": "Meta Gradient Boosting Neural Networks", + "authors": [ + "Manqing Dong", + "Lina Yao", + "Xianzhi Wang", + "Xiwei Xu", + "Liming Zhu" + ], + "emails": [ + "uts.edu.au", + "~Manqing_Dong1", + "~Lina_Yao2", + "data61.csiro.au" + ], + "rank": 2178 + }, + { + "url": "https://openreview.net/forum?id=Ifvv3qpPsqB", + "ratings": [ + 5, + 5, + 6, + 3 + ], + "rating": "4.62", + "confidences": [ + 2, + 4, + 3, + 4 + ], + "abstract": "Adversarial training is promising for improving the robustness of deep neural networks towards adversarial perturbations, especially on the classification task. The effect of this type of training on semantic segmentation, contrarily, just commences. We make the initial attempt to explore the defense strategy on semantic segmentation by formulating a general adversarial training procedure that can perform decently on both adversarial and clean samples. We propose a dynamic divide-and-conquer adversarial training (DDC-AT) strategy to enhance the defense effect, by setting additional branches in the target model during training, and dealing with pixels with diverse properties towards adversarial perturbation. Our dynamical division mechanism divides pixels into multiple branches automatically, achieved by unsupervised learning. Note all these additional branches can be abandoned during inference and thus leave no extra parameter and computation cost. 
Extensive experiments with various segmentation models are conducted on PASCAL VOC 2012 and Cityscapes datasets, in which DDC-AT yields satisfying performance under both white- and black-box attacks.", + "title": "Dynamic Divide-and-Conquer Adversarial Training for Robust Semantic Segmentation", + "authors": [ + "Xiaogang Xu", + "Hengshuang Zhao", + "Jiaya Jia" + ], + "emails": [ + "~Jiaya_Jia1", + "~Xiaogang_Xu2", + "~Hengshuang_Zhao2" + ], + "rank": 2179 + }, + { + "url": "https://openreview.net/forum?id=4IU_xHbLiH", + "ratings": [ + 5, + 6, + 3, + 5 + ], + "rating": "4.62", + "confidences": [ + 3, + 3, + 4, + 3 + ], + "abstract": "Matrix inverse (Minv) and singular value decomposition (SVD) are among the most widely used matrix operations in massive data analysis, machine learning, and statistics. Although well-studied, they still encounter difficulties in practical use due to inefficiency and non-differentiability. In this paper, we aim at solving efficiency and differentiability issues through learning-based methods. First of all, to perform matrix inverse, we provide a differentiable yet efficient way, named LD-Minv, which is a learnable deep neural network (DNN) with each layer being an L-th order matrix polynomial. We show that, with proper initialization, the difference between LD-Minv's output and exact pseudo-inverse is in the order O(exp\u2212LK) where K is the depth of the LD-Minv. Moreover, by learning from data, LD-Minv further reduces the difference between the output and the exact pseudo-inverse. We prove that gradient descent finds an \u03f5-error minimum within O(nKLlog\u2061(1/\u03f5)) steps for LD-Minv, where n is the data size. At last, we provide the generalization bound for LD-Minv in both under-parameterized and over-parameterized settings. As an application of LD-Minv, we provide a learning-based optimization method to solve the problem with orthogonality constraints and utilize it to differentiate SVD (D-SVD). 
We also offer a theoretical generalization guarantee for D-SVD. Finally, we demonstrate the superiority of our methods on the synthetic and real data in the supplementary materials.", + "title": "Fast and Differentiable Matrix Inverse and Its Extension to SVD", + "authors": [ + "Xingyu Xie", + "Hao Kong", + "Jianlong Wu", + "Guangcan Liu", + "Zhouchen Lin" + ], + "emails": [ + "~Jianlong_Wu1", + "pku.edu.cn", + "~Zhouchen_Lin1", + "~Xingyu_Xie1", + "~Guangcan_Liu3" + ], + "rank": 2180 + }, + { + "url": "https://openreview.net/forum?id=uKZsVyFKbaj", + "decision": "Reject", + "ratings": [ + 5, + 3, + 5, + 6 + ], + "rating": "4.62", + "confidences": [ + 4, + 5, + 3, + 4 + ], + "abstract": "Efforts to improve the learning abilities of neural networks have focused mostly on the role of optimization methods rather than on weight initializations. Recent findings, however, suggest that neural networks rely on lucky random initial weights of subnetworks called \"lottery tickets\" that converge quickly to a solution. To investigate how weight initializations affect performance, we examine small convolutional networks that are trained to predict n steps of the two-dimensional cellular automaton Conway\u2019s Game of Life, the update rules of which can be implemented efficiently in a small CNN. We find that networks of this architecture trained on this task rarely converge. Rather, networks require substantially more parameters to consistently converge. Furthermore, we find that the initialization parameters that gradient descent converges to a solution are sensitive to small perturbations, such as a single sign change. Finally, we observe a critical value d0 such that training minimal networks with examples in which cells are alive with probability d0 dramatically increases the chance of convergence to a solution. Our results are consistent with the lottery ticket hypothesis.", + "title": "It's Hard for Neural Networks to Learn the Game of Life", + "authors": [ + "Jacob M. 
Springer", + "Garrett T. Kenyon" + ], + "emails": [ + "~Garrett_T._Kenyon1", + "~Jacob_M._Springer1" + ], + "rank": 2181 + }, + { + "url": "https://openreview.net/forum?id=-csYGiUuGlt", + "decision": "Reject", + "ratings": [ + 3, + 4, + 8, + 7, + 3 + ], + "rating": "4.61", + "confidences": [ + 5, + 5, + 4, + 1, + 3 + ], + "abstract": "Adaptive gradient methods including Adam, AdaGrad, and their variants have been very successful for training deep learning models, such as neural networks, in the past few years. Meanwhile, given the need for distributed training procedures, distributed optimization algorithms are at the center of attention. With the growth of computing power and the need for using machine learning models on mobile devices, the communication cost of distributed training algorithms needs careful consideration. In that regard, more and more attention is shifted from the traditional parameter server training paradigm to the decentralized one, which usually requires lower communication costs. In this paper, we rigorously incorporate adaptive gradient methods into decentralized training procedures and introduce novel convergent decentralized adaptive gradient methods. Specifically, we propose a general algorithmic framework that can convert existing adaptive gradient methods to their decentralized counterparts. In addition, we thoroughly analyze the convergence behavior of the proposed algorithmic framework and show that if a given adaptive gradient method converges, under some specific conditions, then its decentralized counterpart is also convergent. 
", + "title": "Convergent Adaptive Gradient Methods in Decentralized Optimization", + "authors": [ + "Xiangyi Chen", + "Belhal Karimi", + "Weijie Zhao", + "Ping Li" + ], + "emails": [ + "~Ping_Li3", + "baidu.com", + "~Xiangyi_Chen1", + "~Belhal_Karimi1" + ], + "rank": 2182 + }, + { + "url": "https://openreview.net/forum?id=HO80-Z4l0M", + "decision": "Reject", + "ratings": [ + 4, + 4, + 8, + 3 + ], + "rating": "4.61", + "confidences": [ + 4, + 5, + 4, + 5 + ], + "abstract": "Deep learning classification models typically train poorly on classes with small numbers of examples. Motivated by the human ability to solve this task, models have been developed that transfer knowledge from classes with many examples to learn classes with few examples. Critically, the majority of these models transfer knowledge within model feature space. In this work, we demonstrate that transferring knowledge within classifier space is more effective and efficient. Specifically, by linearly combining strong nearest neighbor classifiers along with a weak classifier, we are able to compose a stronger classifier. Uniquely, our model can be implemented on top of any existing classification model that includes a classifier layer. We showcase the success of our approach in the task of long-tailed recognition, whereby the classes with few examples, otherwise known as the tail classes, suffer the most in performance and are the most challenging classes to learn. 
Using classifier-level knowledge transfer, we are able to drastically improve - by a margin as high as 10.5% - the state-of-the-art performance on the tail categories.", + "title": "Alpha Net: Adaptation with Composition in Classifier Space", + "authors": [ + "Nadine Chang", + "Jayanth Koushik", + "Michael Tarr", + "Martial Hebert", + "Yu-Xiong Wang" + ], + "emails": [ + "~Nadine_Chang1", + "~Martial_Hebert1", + "~Michael_Tarr1", + "~Yu-Xiong_Wang1", + "~Jayanth_Koushik1" + ], + "rank": 2183 + }, + { + "url": "https://openreview.net/forum?id=padYzanQNbg", + "decision": "Reject", + "ratings": [ + 3, + 6, + 5, + 4 + ], + "rating": "4.60", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Several authors have introduced \\emph{Neural Stochastic Differential Equations} (Neural SDEs), often involving complex theory with various limitations. Here, we aim to introduce a generic, user friendly approach to neural SDEs. Our central contribution is the observation that an SDE is a map from Wiener measure (Brownian motion) to a solution distribution, which may be sampled from, but which does not admit a straightforward notion of probability density -- and that this is just the familiar formulation of a GAN. This produces a continuous-time generative model, arbitrary drift and diffusions are admissible, and in the infinite data limit any SDE may be learnt. After that, we construct a new scheme for sampling \\emph{and reconstructing} Brownian motion, with constant average-case time and memory costs, adapted to the access patterns of an SDE solver. 
Finally, we demonstrate that the adjoint SDE (used for backpropagation) may be constructed via rough path theory, without the previous theoretical complexity of two-sided filtrations.", + "title": "Neural SDEs Made Easy: SDEs are Infinite-Dimensional GANs", + "authors": [ + "Patrick Kidger", + "James Foster", + "Xuechen Li", + "Harald Oberhauser", + "Terry Lyons" + ], + "emails": [ + "~James_Foster4", + "maths.ox.ac.uk", + "~Xuechen_Li1", + "~Patrick_Kidger1" + ], + "rank": 2184 + }, + { + "url": "https://openreview.net/forum?id=xW9zZm9qK0_", + "decision": "Reject", + "ratings": [ + 3, + 3, + 6, + 6, + 5 + ], + "rating": "4.60", + "confidences": [ + 4, + 4, + 4, + 4, + 4 + ], + "abstract": "Label noise is ubiquitous in the era of big data. Deep learning algorithms can easily fit the noise and thus cannot generalize well without properly modeling the noise. In this paper, we propose a new perspective on dealing with label noise called ``\\textit{Class2Simi}''. Specifically, we transform the training examples with noisy class labels into pairs of examples with noisy similarity labels, and propose a deep learning framework to learn robust classifiers with the noisy similarity labels. Note that a class label shows the class that an instance belongs to; while a similarity label indicates whether or not two instances belong to the same class. It is worthwhile to perform the transformation: We prove that the noise rate for the noisy similarity labels is lower than that of the noisy class labels, because similarity labels themselves are robust to noise. For example, given two instances, even if both of their class labels are incorrect, their similarity label could be correct. 
Due to the lower noise rate, Class2Simi achieves remarkably better classification accuracy than its baselines that directly deals with the noisy class labels.", + "title": "Class2Simi: A New Perspective on Learning with Label Noise", + "authors": [ + "Songhua Wu", + "Xiaobo Xia", + "Tongliang Liu", + "Bo Han", + "Mingming Gong", + "Nannan Wang", + "Haifeng Liu", + "Gang Niu" + ], + "emails": [ + "~Mingming_Gong1", + "~Nannan_Wang1", + "~Songhua_Wu1", + "~Xiaobo_Xia1", + "leinao.ai", + "~Bo_Han1", + "~Gang_Niu1", + "~Tongliang_Liu1" + ], + "rank": 2185 + }, + { + "url": "https://openreview.net/forum?id=ZN3s7fN-bo", + "decision": "Reject", + "ratings": [ + 6, + 3, + 4, + 5 + ], + "rating": "4.60", + "confidences": [ + 5, + 4, + 3, + 3 + ], + "abstract": "Visualization tools for supervised learning (SL) allow users to interpret, introspect, and gain an intuition for the successes and failures of their models. While reinforcement learning (RL) practitioners ask many of the same questions while debugging agent policies, existing tools aren't a great fit for the RL setting as these tools address challenges typically found in the SL regime. Whereas SL involves a static dataset, RL often entails collecting new data in challenging environments with partial observability, stochasticity, and non-stationary data distributions. This necessitates the creation of alternate visual interfaces to help us better understand agent policies trained using RL. In this work, we design and implement an interactive visualization tool for debugging and interpreting RL. Our system identifies and addresses important aspects missing from existing tools such as (1) visualizing alternate state representations (different from those seen by the agent) that researchers could use while debugging RL policies; (2) interactive interfaces tailored to metadata stored while training RL agents (3) a conducive workflow designed around RL policy debugging. 
We provide an example workflow of how this system could be used, along with ideas for future extensions.", + "title": "Interactive Visualization for Debugging RL", + "authors": [ + "Shuby Deshpande", + "Benjamin Eysenbach", + "Jeff Schneider" + ], + "emails": [ + "~Shuby_Deshpande1", + "~Benjamin_Eysenbach1", + "~Jeff_Schneider1" + ], + "rank": 2186 + }, + { + "url": "https://openreview.net/forum?id=Ms51cV-vqFY", + "ratings": [ + 5, + 4, + 5 + ], + "rating": "4.60", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "Deep learning based semi-supervised learning (SSL) algorithms have led to promising results in recent years. However, they tend to introduce multiple tunable hyper-parameters, making them less practical in real SSL scenarios where the labeled data is scarce for extensive hyper-parameter search. In this paper, we propose a novel meta-learning based SSL algorithm (Meta-Semi) that requires tuning only one additional hyper-parameter, compared with a standard supervised deep learning algorithm, to achieve competitive performance under various conditions of SSL. We start by de\ufb01ning a meta optimization problem that minimizes the loss on labeled data through dynamically reweighting the loss on unlabeled samples, which are associated with soft pseudo labels during training. As the meta problem is computationally intensive to solve directly, we propose an ef\ufb01cient algorithm to dynamically obtain the approximate solutions. We show theoretically that Meta-Semi converges to the stationary point of the loss function on labeled data under mild conditions. 
Empirically, Meta-Semi outperforms state-of-the-art SSL algorithms signi\ufb01cantly on the challenging semi-supervised CIFAR-100 and STL-10 tasks, and achieves competitive performance on CIFAR-10 and SVHN.", + "title": "Meta-Semi: A Meta-learning Approach for Semi-supervised Learning", + "authors": [ + "Yulin Wang", + "Jiayi Guo", + "Shiji Song", + "Gao Huang" + ], + "emails": [ + "~Yulin_Wang1", + "~Gao_Huang1", + "~Jiayi_Guo2", + "~Shiji_Song1" + ], + "rank": 2187 + }, + { + "url": "https://openreview.net/forum?id=zYmnBGOZtH", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5 + ], + "rating": "4.60", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "Given a dataset D with label noise, how do we learn its underlying noise model? If we assume that the label noise is instance-independent, then the noise model can be represented by a noise transition matrix QD. Recent work has shown that even without further information about any instances with correct labels, or further assumptions on the distribution of the label noise, it is still possible to estimate QD while simultaneously learning a classifier from D. However, this presupposes that a good estimate of QD requires an accurate classifier. In this paper, we show that high classification accuracy is actually not required for estimating QD well. We shall introduce an information-theoretic-based framework for estimating QD solely from D (without additional information or assumptions). At the heart of our framework is a discriminator that predicts whether an input dataset has maximum Shannon entropy, which shall be used on multiple new datasets D^ synthesized from D via the insertion of additional label noise. We prove that our estimator for QD is statistically consistent, in terms of dataset size, and the number of intermediate datasets D^ synthesized from D. 
As a concrete realization of our framework, we shall incorporate local intrinsic dimensionality (LID) into the discriminator, and we show experimentally that with our LID-based discriminator, the estimation error for QD can be significantly reduced. We achieved average Kullback--Leibler loss reduction from 0.27 to 0.17 for 40% anchor-like samples removal when evaluated on the CIFAR10 with symmetric noise. Although no clean subset of D is required for our framework to work, we show that our framework can also take advantage of clean data to improve upon existing estimation methods.", + "title": "An information-theoretic framework for learning models of instance-independent label noise", + "authors": [ + "Xia Huang", + "Kai Fong Ernest Chong" + ], + "emails": [ + "~Kai_Fong_Ernest_Chong1", + "~Xia_Huang1" + ], + "rank": 2188 + }, + { + "url": "https://openreview.net/forum?id=B9nDuDeanHK", + "decision": "Reject", + "ratings": [ + 5, + 5, + 3, + 6 + ], + "rating": "4.60", + "confidences": [ + 4, + 2, + 5, + 4 + ], + "abstract": "Binary Weight Networks (BWNs) have significantly lower computational and memory costs compared to their full-precision counterparts. To address the non-differentiable issue of BWNs, existing methods usually use the Straight-Through-Estimator (STE). In the optimization, they learn optimal binary weight outputs represented as a combination of scaling factors and weight signs to approximate 32-bit floating-point weight values, usually with a layer-wise quantization scheme. In this paper, we begin with an empirical study of training BWNs with STE under the settings of using common techniques and tricks. We show that in the context of using batch normalization after convolutional layers, adapting scaling factors with either hand-crafted or learnable methods brings marginal or no accuracy gain to final model, while the change of weight signs is crucial in the training of BWNs. Furthermore, we observe two astonishing training phenomena. 
Firstly, the training of BWNs demonstrates the process of seeking primary binary sub-networks whose weight signs are determined and fixed at the early training stage, which is akin to recent findings on the lottery ticket hypothesis for efficient learning of sparse neural networks. Secondly, we find binary kernels in the convolutional layers of final models tend to be centered on a limited number of the most frequent binary kernels, showing binary weight networks may has the potential to be further compressed, which breaks the common wisdom that representing each weight with a single bit puts the quantization to the extreme compression. To testify this hypothesis, we additionally propose a binary kernel quantization method, and we call resulting models Quantized Binary-Kernel Networks (QBNs). We hope these new experimental observations would shed new design insights to improve the training and broaden the usages of BWNs.", + "title": "Weights Having Stable Signs Are Important: Finding Primary Subnetworks and Kernels to Compress Binary Weight Networks", + "authors": [ + "Zhaole Sun", + "Anbang Yao" + ], + "emails": [ + "~Zhaole_Sun1", + "~Anbang_Yao1" + ], + "rank": 2189 + }, + { + "url": "https://openreview.net/forum?id=dkcJ_pwHW3", + "ratings": [ + 5, + 5, + 4 + ], + "rating": "4.60", + "confidences": [ + 2, + 4, + 4 + ], + "abstract": "As deep neural networks are growing in size and being increasingly deployed to more resource-limited devices, there has been a recent surge of interest in network pruning methods, which aim to remove less important weights or activations of a given network. A common limitation of most existing pruning techniques, is that they require pre-training of the networks at least once before pruning, and thus we can benefit from reduction in memory and computation only at the inference time. 
However, reducing the training cost of neural networks with rapid structural pruning may be beneficial either to minimize monetary cost with cloud computing or to enable on-device learning on a resource-limited device. Recently introduced random-weight pruning approaches can eliminate the needs of pretraining, but they often obtain suboptimal performance over conventional pruning techniques and also does not allow for faster training since they perform unstructured pruning. To overcome their limitations, we propose Set-based Task-Adaptive Meta Pruning (STAMP), which task-adaptively prunes a network pretrained on a large reference dataset by generating a pruning mask on it as a function of the target dataset. To ensure maximum performance improvements on the target task, we meta-learn the mask generator over different subsets of the reference dataset, such that it can generalize well to any unseen datasets within a few gradient steps of training. We validate STAMP against recent advanced pruning methods on benchmark datasets, on which it not only obtains significantly improved compression rates over the baselines at similar accuracy, but also orders of magnitude faster training speed.", + "title": "Rapid Neural Pruning for Novel Datasets with Set-based Task-Adaptive Meta-Pruning", + "authors": [ + "Minyoung Song", + "Jaehong Yoon", + "Eunho Yang", + "Sung Ju Hwang" + ], + "emails": [ + "~Eunho_Yang1", + "~Sung_Ju_Hwang1", + "~Minyoung_Song1", + "~Jaehong_Yoon1" + ], + "rank": 2190 + }, + { + "url": "https://openreview.net/forum?id=gYbimGJAENn", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 3 + ], + "rating": "4.60", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "We propose a simple architecture to address unpaired image-to-image translation tasks: style or class transfer, denoising, deblurring, deblocking, etc. \nWe start from an image autoencoder architecture with fixed weights. 
\nFor each task we learn a residual block operating in the latent space, which is iteratively called until the target domain is reached. \nA specific training schedule is required to alleviate the exponentiation effect of the iterations. \nAt test time, it offers several advantages: the number of weight parameters is limited and the compositional design allows one to modulate the strength of the transformation with the number of iterations. \nThis is useful, for instance, when the type or amount of noise to suppress is not known in advance. \nExperimentally, we show that the performance of our model is comparable or better than CycleGAN and Nice-GAN with fewer parameters.", + "title": "Powers of layers for image-to-image translation", + "authors": [ + "Hugo Touvron", + "Matthijs Douze", + "Matthieu Cord", + "Herve Jegou" + ], + "emails": [ + "~Matthijs_Douze1", + "~Matthieu_Cord1", + "~Hugo_Touvron1", + "~Herve_Jegou1" + ], + "rank": 2191 + }, + { + "url": "https://openreview.net/forum?id=NYLvNv8q4i", + "ratings": [ + 4, + 5, + 4, + 6 + ], + "rating": "4.60", + "confidences": [ + 4, + 3, + 5, + 3 + ], + "abstract": "This work studies intragroup sparsity, a fine-grained structural constraint on network weight parameters. It eliminates the computational inefficiency of fine-grained sparsity due to irregular dataflow, while at the same time achieving high inference accuracy. We present theoretical analysis on how weight group sizes affect sparsification error, and on how the performance of pruned networks changes with sparsity level. Further, we analyze inference-time I/O cost of two different strategies for achieving intragroup sparsity and how the choice of strategies affect I/O cost under mild assumptions on accelerator architecture. 
Moreover, we present a novel training algorithm that yield models of improved accuracies over the standard training approach under the intragroup sparsity constraint.", + "title": "Intragroup sparsity for efficient inference", + "authors": [ + "Zilin Yu", + "Chao Wang", + "Xin Wang", + "Yong Zhao", + "Xundong Wu" + ], + "emails": [ + "~Yong_Zhao4", + "~Xundong_Wu3", + "~Zilin_Yu1", + "~Chao_Wang1", + "~Xin_Wang2" + ], + "rank": 2192 + }, + { + "url": "https://openreview.net/forum?id=HCa8gC_COVk", + "decision": "Reject", + "ratings": [ + 5, + 6, + 3, + 5 + ], + "rating": "4.60", + "confidences": [ + 3, + 4, + 5, + 3 + ], + "abstract": "Deep generative models are generally categorized into explicit models and implicit models. The former defines an explicit density form that allows likelihood inference; while the latter targets a flexible transformation from random noise to generated samples. To take full advantages of both models, we propose Stein Bridging, a novel joint training framework that connects an explicit (unnormalized) density estimator and an implicit sample generator via Stein discrepancy. We show that the Stein bridge 1) induces novel mutual regularization via kernel Sobolev norm penalization and Moreau-Yosida regularization, and 2) stabilizes the training dynamics. 
Empirically, we demonstrate that Stein Bridging can facilitate the density estimator to accurately identify data modes and guide the sample generator to output more high-quality samples especially when the training samples are contaminated or limited.", + "title": "Mutual Calibration between Explicit and Implicit Deep Generative Models", + "authors": [ + "Qitian Wu", + "Rui Gao", + "Hongyuan Zha" + ], + "emails": [ + "~Hongyuan_Zha1", + "~Rui_Gao3", + "~Qitian_Wu1" + ], + "rank": 2193 + }, + { + "url": "https://openreview.net/forum?id=mnj-9lYJgu", + "decision": "Reject", + "ratings": [ + 5, + 3, + 6, + 5 + ], + "rating": "4.59", + "confidences": [ + 5, + 5, + 3, + 4 + ], + "abstract": "We introduce Deep Adaptive Semantic Logic (DASL), a novel framework for automating the generation of deep neural networks that incorporates user-provided formal knowledge to improve learning from data. We provide formal semantics that demonstrate that our knowledge representation captures all of first order logic and that finite sampling from infinite domains converges to correct truth values. DASL\u2019s representation improves on prior neuro-symbolic work by avoiding vanishing gradients, allowing deeper logical structure, and enabling richer interactions between the knowledge and learning components. We illustrate DASL through a toy problem in which we add structure to an image classification problem and demonstrate that knowledge of that structure\nreduces data requirements by a factor of 1000. 
We apply DASL on a visual relationship detection task and demonstrate that the addition of commonsense knowledge improves performance by 10.7% in a data scarce setting.", + "title": "DEEP ADAPTIVE SEMANTIC LOGIC (DASL): COMPILING DECLARATIVE KNOWLEDGE INTO DEEP NEURAL NETWORKS", + "authors": [ + "Karan Sikka", + "Andrew Silberfarb", + "John Byrnes", + "Indranil Sur", + "Ed Chow", + "Ajay Divakaran", + "Richard Rohwer" + ], + "emails": [ + "~Karan_Sikka1", + "~Ajay_Divakaran1", + "sri.com", + "~Indranil_Sur1" + ], + "rank": 2194 + }, + { + "url": "https://openreview.net/forum?id=TmUfsLjI-1", + "decision": "Reject", + "ratings": [ + 4, + 4, + 6, + 4 + ], + "rating": "4.59", + "confidences": [ + 5, + 4, + 5, + 3 + ], + "abstract": "Transfer learning has been recently popularized as a data-efficient alternative to training models from scratch, in particular in vision and NLP where it provides a remarkably solid baseline. The emergence of rich model repositories, such as TensorFlow Hub, enables the practitioners and researchers to unleash the potential of these models across a wide range of downstream tasks. As these repositories keep growing exponentially, efficiently selecting a good model for the task at hand becomes paramount. We provide a formalization of this problem through a familiar notion of regret and introduce the predominant strategies, namely task-agnostic (e.g. picking the highest scoring ImageNet model) and task-aware search strategies (such as linear or kNN evaluation). We conduct a large-scale empirical study and show that both task-agnostic and task-aware methods can yield high regret. We then propose a simple and computationally efficient hybrid search strategy which outperforms the existing approaches. We highlight the practical benefits of the proposed solution on a set of 19 diverse vision tasks.", + "title": "Which Model to Transfer? 
Finding the Needle in the Growing Haystack", + "authors": [ + "Cedric Renggli", + "Andr\u00e9 Susano Pinto", + "Luka Rimanic", + "Joan Puigcerver", + "Carlos Riquelme Ruiz", + "Ce Zhang", + "Mario Lucic" + ], + "emails": [ + "~Mario_Lucic1", + "~Cedric_Renggli1", + "inf.ethz.ch", + "~Joan_Puigcerver1", + "~Andr\u00e9_Susano_Pinto1", + "~Ce_Zhang1", + "~Carlos_Riquelme_Ruiz1" + ], + "rank": 2195 + }, + { + "url": "https://openreview.net/forum?id=_DVn-4CoCty", + "ratings": [ + 5, + 4, + 5, + 5, + 4 + ], + "rating": "4.59", + "confidences": [ + 4, + 3, + 2, + 4, + 4 + ], + "abstract": "We present a ``learning to learn'' approach for automatically constructing white-box classification loss functions that are robust to label noise in the training data. We paramaterize a flexible family of loss functions using Taylor polynomials, and apply evolutionary strategies to search for noise-robust losses in this space. To learn re-usable loss functions that can apply to new tasks, our fitness function scores their performance in aggregate across a range of training datasets and architecture combinations. 
The resulting white-box loss provides a simple and fast ``plug-and-play'' module that enables effective noise-robust learning in diverse downstream tasks, without requiring a special training procedure or network architecture.", + "title": "Searching for Robustness: Loss Learning for Noisy Classification Tasks", + "authors": [ + "Boyan Gao", + "Henry Gouk", + "Timothy Hospedales" + ], + "emails": [ + "~Henry_Gouk1", + "~Boyan_Gao1", + "~Timothy_Hospedales1" + ], + "rank": 2196 + }, + { + "url": "https://openreview.net/forum?id=WJfIKDt8d2f", + "decision": "Concerns raised (can publish with adjustment)", + "ratings": [ + 4, + 4, + 4, + 6 + ], + "rating": "4.59", + "confidences": [ + 5, + 4, + 3, + 5 + ], + "abstract": "To ensure the wide adoption and safety of autonomous driving, the vehicles need to be able to drive under various lighting, weather, and visibility conditions in different environments. These external and environmental factors, along with internal factors associated with sensors, can pose significant challenges to perceptual data processing, hence affecting the decision-making of the vehicle. In this work, we address this critical issue by analyzing the sensitivity of the learning algorithm with respect to varying quality in the image input for autonomous driving. Using the results of sensitivity analysis, we further propose an algorithm to improve the overall performance of the task of ``learning to steer''. The results show that our approach is able to enhance the learning outcomes up to 48%. A comparative study drawn between our approach and other related techniques, such as data augmentation and adversarial training, confirms the effectiveness of our algorithm as a way to improve the robustness and generalization of neural network training for self-driving cars. 
", + "title": "Driving through the Lens: Improving Generalization of Learning-based Steering using Simulated Adversarial Examples", + "authors": [ + "Yu Shen", + "Laura Yu Zheng", + "Manli Shu", + "Weizi Li", + "Tom Goldstein", + "Ming Lin" + ], + "emails": [ + "~Laura_Yu_Zheng1", + "~Weizi_Li1", + "~Yu_Shen1", + "~Tom_Goldstein1", + "~Ming_Lin2", + "~Manli_Shu1" + ], + "rank": 2197 + }, + { + "url": "https://openreview.net/forum?id=rmd-D7h_2zP", + "decision": "Reject", + "ratings": [ + 5, + 3, + 7, + 4 + ], + "rating": "4.59", + "confidences": [ + 3, + 5, + 4, + 5 + ], + "abstract": "Dialogue state tracking for multi-domain dialogues is challenging because the model should be able to track dialogue states across multiple domains and slots. Past studies had its limitations in that they did not factor in the relationship among different domain-slot pairs. Although recent approaches did support relationship modeling among the domain-slot pairs, they did not leverage a pre-trained language model, which has improved the performance of numerous natural language tasks, in the encoding process. Our approach fills the gap between these previous studies. We propose a model for multi-domain dialogue state tracking that effectively models the relationship among domain-slot pairs using a pre-trained language encoder. Inspired by the way the special [CLS] token in BERT is used to aggregate the information of the whole sequence, we use multiple special tokens for each domain-slot pair that encodes information corresponding to its domain and slot. The special tokens are run together with the dialogue context through the pre-trained language encoder, which effectively models the relationship among different domain-slot pairs. 
Our experimental results show that our model achieves state-of-the-art performance on the MultiWOZ-2.1 and MultiWOZ-2.2 dataset.", + "title": "Domain-slot Relationship Modeling using a Pre-trained Language Encoder for Multi-Domain Dialogue State Tracking", + "authors": [ + "Jinwon An", + "Misuk Kim", + "Sungzoon Cho", + "Junseong Bang" + ], + "emails": [ + "etri.re.kr", + "~Misuk_Kim1", + "~Sungzoon_Cho1", + "~Jinwon_An1" + ], + "rank": 2198 + }, + { + "url": "https://openreview.net/forum?id=5IqTrksw9S", + "decision": "Reject", + "ratings": [ + 4, + 6, + 4, + 4 + ], + "rating": "4.59", + "confidences": [ + 3, + 5, + 5, + 4 + ], + "abstract": "A multitude of machine learning models for source code have been proposed in the recent years capturing various aspects of the inherent rich structure and semantics of code. However, these models are commonly designed to perform well on a single task, failing to capture code's multifaceted nature. To address this, we present GLUECode, Global and Local Understanding Evaluation of Code, a benchmark of diverse tasks to evaluate machine learning models of source code. \n\nCrucially, GLUECode accounts for the distinct characteristics of source code: (1) source code is highly structured and (2) source code is often composed of multiple interacting entities. Existing tasks incentivize researchers to create models and code representations that perform well on a single task - commonly focusing on local reasoning. GLUECode aims to allow researchers to experiment with multiple local and global source code representations, and evaluate these models on their ability to capture the diverse characteristics of source code, thus driving the community towards building robust source code models incorporating global reasoning. \n\nWe present results for several baselines. The GLUECode tasks are challenging for the evaluated baselines; no model achieves convincing performance across all tasks. 
This indicates that there is ample room for progress on GLUECode.", + "title": "GLUECode: A Benchmark for Source Code Machine Learning Models", + "authors": [ + "Anjan Karmakar", + "Julian Aron Prenner", + "Miltiadis Allamanis", + "Romain Robbes" + ], + "emails": [ + "unibz.it", + "~Anjan_Karmakar1", + "~Miltiadis_Allamanis1" + ], + "rank": 2199 + }, + { + "url": "https://openreview.net/forum?id=uHNEe2aR4qJ", + "decision": "Reject", + "ratings": [ + 4, + 4, + 6, + 4, + 5 + ], + "rating": "4.59", + "confidences": [ + 4, + 4, + 4, + 3, + 2 + ], + "abstract": "Negative pretraining is a prominent sequential learning effect of neural networks where a pretrained model obtains a worse generalization performance than a model that is trained from scratch when either are trained on a target task. We conceptualize the ingredients of this problem setting and examine the negative pretraining effect experimentally by providing three interventions to remove and fix it. First, acting on the learning process, altering the learning rate after pretraining can yield even better results than training directly on the target task. Second, on the learning task-level, we intervene by increasing the discretization of data distribution changes from start to target task instead of \u201cjumping\u201d to a target task. Finally at the model-level, resetting network biases to larger values likewise removes negative pretraining effects, albeit to a smaller degree. With these intervention experiments, we aim to provide new evidence to help understand the subtle influences that neural network training and pretraining can have on final generalization performance on a target task in the context of negative pretraining. ", + "title": "The Negative Pretraining Effect in Sequential Deep Learning and Three Ways to Fix It", + "authors": [ + "Julian G. 
Zilly", + "Franziska Eckert", + "Bhairav Mehta", + "Andrea Censi", + "Emilio Frazzoli" + ], + "emails": [ + "~Julian_G._Zilly1", + "~Bhairav_Mehta1", + "~Andrea_Censi1", + "~Emilio_Frazzoli1", + "~Franziska_Eckert1" + ], + "rank": 2200 + }, + { + "url": "https://openreview.net/forum?id=o6ndFLB1DST", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4, + 4 + ], + "rating": "4.59", + "confidences": [ + 4, + 3, + 5, + 5 + ], + "abstract": "Counterfactual explanations for machine learning models are used to find minimal interventions to the feature values such that the model changes the prediction to a different output or a target output. A valid counterfactual explanation should have likely feature values. Here, we address the challenge of generating counterfactual explanations that lie in the same data distribution as that of the training data and more importantly, they belong to the target class distribution. This requirement has been addressed through the incorporation of auto-encoder reconstruction loss in the counterfactual search process. Connecting the output behavior of the classifier to the latent space of the auto-encoder has further improved the speed of the counterfactual search process and the interpretability of the resulting counterfactual explanations. Continuing this line of research, we show further improvement in the interpretability of counterfactual explanations when the auto-encoder is trained in a semi-supervised fashion with class tagged input data. We empirically evaluate our approach on several datasets and show considerable improvement in-terms of several metrics.", + "title": "Semi-supervised counterfactual explanations", + "authors": [ + "SURYA SHRAVAN KUMAR SAJJA", + "Sumanta Mukherjee", + "Satyam Dwivedi", + "Vikas C. 
Raykar" + ], + "emails": [ + "~Sumanta_Mukherjee1", + "~Vikas_C._Raykar1", + "~SURYA_SHRAVAN_KUMAR_SAJJA1", + "in.ibm.com" + ], + "rank": 2201 + }, + { + "url": "https://openreview.net/forum?id=4JLiaohIk9", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5, + 4 + ], + "rating": "4.59", + "confidences": [ + 3, + 5, + 4, + 5 + ], + "abstract": "Motion forecasting is essential for making safe and intelligent decisions in robotic applications such as autonomous driving. State-of-the-art methods formulate it as a sequence-to-sequence prediction problem, which is solved in an encoder-decoder framework with a maximum likelihood estimation objective. In this paper, we show that the likelihood objective itself results in a model assigning too much probability to trajectories that are unlikely given the contextual information such as maps and states of surrounding agents. This is despite the fact that many state-of-the-art models do take contextual information as part of their input. We propose a new objective, unlikelihood training, which forces generated trajectories that conflicts with contextual information to be assigned a lower probability by our model. We demonstrate that our method can significantly improve state-of-art models\u2019 performance on challenging real-world trajectory forecasting datasets (nuScenes and Argoverse) by 8% and reduce the standard deviation by up to 50%. 
The code will be made available.", + "title": "Motion Forecasting with Unlikelihood Training", + "authors": [ + "Deyao Zhu", + "Mohamed Zahran", + "Li Erran Li", + "Mohamed Elhoseiny" + ], + "emails": [ + "~Mohamed_Zahran1", + "~Mohamed_Elhoseiny1", + "~Li_Erran_Li1", + "~Deyao_Zhu1" + ], + "rank": 2202 + }, + { + "url": "https://openreview.net/forum?id=hBxSksqPuOg", + "decision": "Reject", + "ratings": [ + 4, + 6, + 4, + 5, + 4 + ], + "rating": "4.59", + "confidences": [ + 3, + 3, + 3, + 4, + 4 + ], + "abstract": "Generative models are increasingly able to produce remarkably high quality images and text. The community has developed numerous evaluation metrics for comparing generative models. However, these metrics do not effectively quantify data diversity. We develop a new diversity metric that can readily be applied to data, both synthetic and natural, of any type. Our method employs random network distillation, a technique introduced in reinforcement learning. We validate and deploy this metric on both images and text. We further explore diversity in few-shot image generation, a setting which was previously difficult to evaluate.", + "title": "Random Network Distillation as a Diversity Metric for Both Image and Text Generation", + "authors": [ + "Liam H Fowl", + "Micah Goldblum", + "Arjun Gupta", + "Amr Sharaf", + "Tom Goldstein" + ], + "emails": [ + "~Liam_H_Fowl1", + "~Amr_Sharaf1", + "~Arjun_Gupta2", + "~Micah_Goldblum1", + "~Tom_Goldstein1" + ], + "rank": 2203 + }, + { + "url": "https://openreview.net/forum?id=DC1Im3MkGG", + "decision": "Reject", + "ratings": [ + 4, + 6, + 5, + 4 + ], + "rating": "4.58", + "confidences": [ + 4, + 2, + 3, + 3 + ], + "abstract": "Standard learning approaches are designed to perform well on average for the data distribution available at training time. 
Developing learning approaches that are not overly sensitive to the training distribution is central to research on domain- or out-of-distribution generalization, robust optimization and fairness. In this work we focus on links between research on domain generalization and algorithmic fairness---where performance under a distinct but related test distributions is studied---and show how the two fields can be mutually beneficial. While domain generalization methods typically rely on knowledge of disjoint \"domains\" or \"environments\", \"sensitive\" label information indicating which demographic groups are at risk of discrimination is often used in the fairness literature. Drawing inspiration from recent fairness approaches that improve worst-case performance without knowledge of sensitive groups, we propose a novel domain generalization method that handles the more realistic scenario where environment partitions are not provided. We then show theoretically and empirically how different partitioning schemes can lead to increased or decreased generalization performance, enabling us to outperform Invariant Risk Minimization with handcrafted environments in multiple cases. 
We also show how a re-interpretation of IRMv1 allows us for the first time to directly optimize a common fairness criterion, group-sufficiency, and thereby improve performance on a fair prediction task.\n", + "title": "Exchanging Lessons Between Algorithmic Fairness and Domain Generalization", + "authors": [ + "Elliot Creager", + "Joern-Henrik Jacobsen", + "Richard Zemel" + ], + "emails": [ + "~Elliot_Creager1", + "~Joern-Henrik_Jacobsen1", + "~Richard_Zemel1" + ], + "rank": 2204 + }, + { + "url": "https://openreview.net/forum?id=_cadenVdKzF", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5 + ], + "rating": "4.58", + "confidences": [ + 4, + 5, + 3 + ], + "abstract": "For natural language processing (NLP) \u2018text-to-text\u2019 tasks, prevailing approaches heavily rely on pretraining large self-supervised models on massive external datasources. However, this methodology is being critiqued for: exceptional compute and pretraining data requirements; diminishing returns on both large and small datasets; and importantly, favourable evaluation settings that overestimate performance differences. The core belief behind current methodology, coined 'the bitter lesson' by R. Sutton, is that 'compute scale-up beats data and compute-efficient algorithms', neglecting that progress in compute hardware scale-up is based almost entirely on the miniaturisation of resource consumption. We thus approach pretraining from a miniaturisation perspective, such as not to require massive external data sources and models, or learned translations from continuous input embeddings to discrete labels. To minimise overly favourable evaluation, we examine learning on a long-tailed, low-resource, multi-label text classification dataset with noisy, highly sparse labels and many rare concepts. 
To this end, we propose a novel 'dataset-internal' contrastive autoencoding approach to self-supervised pretraining and demonstrate marked improvements in zero-shot, few-shot and solely supervised learning performance; even under an unfavorable low-resource scenario, and without defaulting to large-scale external datasets for self-supervision. We also find empirical evidence that zero and few-shot learning markedly benefit from adding more 'dataset-internal', self-supervised training signals, which is of practical importance when retrieving or computing on large external sources of such signals is infeasible.", + "title": "Self-supervised Contrastive Zero to Few-shot Learning from Small, Long-tailed Text data", + "authors": [ + "Nils Rethmeier", + "Isabelle Augenstein" + ], + "emails": [ + "~Isabelle_Augenstein1", + "~Nils_Rethmeier1" + ], + "rank": 2205 + }, + { + "url": "https://openreview.net/forum?id=PmUGXmOY1wK", + "decision": "Reject", + "ratings": [ + 5, + 3, + 4, + 6, + 5 + ], + "rating": "4.58", + "confidences": [ + 4, + 4, + 4, + 4, + 3 + ], + "abstract": "Graph-level representation learning plays a crucial role in a variety of tasks such as molecular property prediction and community analysis. Currently, several models based on mutual information maximization have shown strong performance on the task of unsupervised graph representation learning. In this paper, instead, we consider a disentanglement approach to learn graph-level representations in the unsupervised setting. Our work is the first to study disentanglement learning for graph-level representations. Our key observation is that the formation of many real-world graphs is a complex process with global and local generative factors. We hypothesize that disentangled representations which capture these global and local generative factors into independent latent units can be highly beneficial. 
Specifically, for graph-level representation learning, our disentanglement approach can alleviate distraction due to local variations of individual nodes or individual local neighbourhoods. We propose a VAE based learning algorithm to disentangle the global graph-level information, which is common across the entire graph, and local patch-level information, which varies across individual patches (the local subgraphs centered around the nodes). Through extensive experiments and analysis, we show that our method achieves the state-of-the-art performance on the task of unsupervised graph representation learning.\n", + "title": "GL-Disen: Global-Local disentanglement for unsupervised learning of graph-level representations", + "authors": [ + "Thilini Cooray", + "Ngai-man Cheung", + "Wei Lu" + ], + "emails": [ + "~Thilini_Cooray1", + "~Ngai-man_Cheung1", + "~Wei_Lu4" + ], + "rank": 2206 + }, + { + "url": "https://openreview.net/forum?id=j7xc3_iqJt", + "ratings": [ + 5, + 4, + 5 + ], + "rating": "4.58", + "confidences": [ + 4, + 5, + 3 + ], + "abstract": "We introduce Mem2Mem, a memory-to-memory mechanism for hierarchical recurrent neural network based encoder decoder architectures and we explore its use for abstractive document summarization. Mem2Mem transfers memories via readable/writable external memory modules that augment both the encoder and decoder. Our memory regularization compresses an encoded input article into a more compact set of sentence representations. Most importantly, the memory compression step performs implicit extraction without labels, sidestepping issues with suboptimal ground-truth data and exposure bias of hybrid extractive-abstractive summarization techniques. By allowing the decoder to read/write over the encoded input memory, the model learns to read salient information about the input article while keeping track of what has been generated. 
Our Mem2Mem approach yields results that are competitive with state of the art transformer based summarization methods, but with 16 times fewer parameters.", + "title": "Mem2Mem: Learning to Summarize Long Texts with Memory Compression and Transfer", + "authors": [ + "Jonathan Pilault", + "Jaehong Park", + "Christopher Pal" + ], + "emails": [ + "~Jonathan_Pilault1", + "~Jaehong_Park2", + "~Christopher_Pal1" + ], + "rank": 2207 + }, + { + "url": "https://openreview.net/forum?id=m4baHw5LZ7M", + "decision": "Reject", + "ratings": [ + 9, + 4, + 4, + 3 + ], + "rating": "4.58", + "confidences": [ + 2, + 2, + 5, + 3 + ], + "abstract": "Solving the eigenvalue problem for differential operators is a common problem in many scientific fields. Classical numerical methods rely on intricate domain discretization, and yield non-analytic or non-smooth approximations. We introduce a novel Neural Network (NN)-based solver for the eigenvalue problem of differential self-adjoint operators where the eigenpairs are learned in an unsupervised end-to-end fashion. We propose three different training procedures, for solving increasingly challenging tasks towards the general eigenvalue problem. \n\nThe proposed solver is able to find the M smallest eigenpairs for a general differential operator. We demonstrate the method on the Laplacian operator which is of particular interest in image processing, computer vision, shape analysis among many other applications. \nUnlike other numerical methods such as finite differences, the partial derivatives of the network approximation of the eigenfunction can be analytically calculated to any order. Therefore, the proposed framework enables the solution of higher order operators and on free shape domain or even on a manifold. 
Non-linear operators can be investigated by this approach as well.", + "title": "Deep Learning Solution of the Eigenvalue Problem for Differential Operators", + "authors": [ + "Ido Ben-Shaul", + "Leah Bar", + "Nir Sochen" + ], + "emails": [ + "~Nir_Sochen1", + "~Leah_Bar2", + "~Ido_Ben-Shaul1" + ], + "rank": 2208 + }, + { + "url": "https://openreview.net/forum?id=x9C7Nlwgydy", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5 + ], + "rating": "4.58", + "confidences": [ + 5, + 3, + 4 + ], + "abstract": "Recent advances in deep clustering and unsupervised representation learning are based on the idea that different views of an input image (generated through data augmentation techniques) must either be closer in the representation space, or have a similar cluster assignment. In this work, we leverage this idea together with ensemble learning to perform clustering and representation learning. Ensemble learning is widely used in the supervised learning setting but has not yet been practical in deep clustering. Previous works on ensemble learning for clustering neither work on the feature space nor learn features. We propose a novel ensemble learning algorithm dubbed Consensus Clustering with Unsupervised Representation Learning (ConCURL) which learns representations by creating a consensus on multiple clustering outputs. Specifically, we generate a cluster ensemble using random transformations on the embedding space, and define a consensus loss function that measures the disagreement among the constituents of the ensemble. Thus, diverse ensembles minimize this loss function in a synergistic way, which leads to better representations that work with all cluster ensemble constituents. Our proposed method ConCURL is easy to implement and integrate into any representation learning or deep clustering block. ConCURL outperforms all state of the art methods on various computer vision datasets. 
Specifically, we beat the closest state of the art method by 5.9 percent on the ImageNet-10 dataset, and by 18 percent on the ImageNet-Dogs dataset in terms of clustering accuracy. We further shed some light on the under-studied overfitting issue in clustering and show that our method does not overfit as much as existing methods, and thereby generalizes better for new data samples. ", + "title": "Consensus Clustering with Unsupervised Representation Learning", + "authors": [ + "Jayanth Reddy Regatti", + "Aniket Anand Deshmukh", + "Eren Manavoglu", + "Urun Dogan" + ], + "emails": [ + "~Urun_Dogan1", + "~Eren_Manavoglu1", + "~Jayanth_Reddy_Regatti1", + "~Aniket_Anand_Deshmukh1" + ], + "rank": 2209 + }, + { + "url": "https://openreview.net/forum?id=FVhZIBWqykk", + "ratings": [ + 6, + 4, + 6, + 3 + ], + "rating": "4.57", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Submodularity is a desirable property for a variety of objectives in content selection where the current neural encoder-decoder framework is inadequate. We define a class of novel attention mechanisms with submodular functions and in turn, prove the submodularity of the effective neural coverage. The resulting attention module offers an architecturally simple and empirically effective method to improve the coverage of neural text generation. We run experiments on three directed text generation tasks with different levels of recovering rate, across two modalities, three different neural model architectures and two training strategy variations. The results and analyses demonstrate that our method generalizes well across these settings, produces texts of good quality, outperforms comparable baselines and achieves state-of-the-art performance. 
", + "title": "Resurrecting Submodularity for Neural Text Generation", + "authors": [ + "SIMENG HAN", + "Xiang Lin", + "Shafiq Joty" + ], + "emails": [ + "~SIMENG_HAN1", + "~Shafiq_Joty1", + "~Xiang_Lin2" + ], + "rank": 2210 + }, + { + "url": "https://openreview.net/forum?id=a4E6SL1rG3F", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 6 + ], + "rating": "4.57", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Meta-learning models transfer the knowledge acquired from previous tasks to quickly learn new ones. They are tested on benchmarks with a fixed number of data-points for each training task, and this number is usually arbitrary, for example, 5 instances per class in few-shot classification. It is unknown how the performance of meta-learning is affected by the distribution of data across training tasks. Since labelling of data is expensive, finding the optimal allocation of labels across training tasks may reduce costs. \nGiven a fixed budget b of labels to distribute across tasks, should we use a small number of highly labelled tasks, or many tasks with few labels each? In MAML applied to mixed linear regression, we prove that the optimal number of tasks follows the scaling law sqrt{b}. We develop an online algorithm for data allocation across tasks, and show that the same scaling law applies to nonlinear regression. We also show preliminary experiments on few-shot image classification. 
Our work provides a theoretical guide for allocating labels across tasks in meta-learning, which we believe will prove useful in a large number of applications.\n", + "title": "Optimal allocation of data across training tasks in meta-learning", + "authors": [ + "Georgios Batzolis", + "Alberto Bernacchia", + "Da-shan Shiu", + "Michael Bromberg", + "Alexandru Cioba" + ], + "emails": [ + "mtkresearch.com", + "~Alberto_Bernacchia1", + "gmail.com" + ], + "rank": 2211 + }, + { + "url": "https://openreview.net/forum?id=Fa3a14yX8zA", + "ratings": [ + 4, + 4, + 6, + 5 + ], + "rating": "4.57", + "confidences": [ + 5, + 3, + 2, + 4 + ], + "abstract": "Typically in machine learning, training and tuning are done in an alternating manner: for a fixed set of hyperparameters y, we apply gradient descent to our objective f(x,y) over trainable variables x until convergence; then, we apply a tuning step over y to find another promising setting of hyperparameters. Because the full training cycle is completed before a tuning step is applied, the optimization procedure greatly emphasizes the gradient step, which seems justified as first-order methods provides a faster convergence rate. In this paper, we argue that an equal emphasis on training and tuning lead to faster convergence both theoretically and empirically. We present Joint Descent (JD) and a novel theoretical analysis of acceleration via an unbiased gradient estimate to give an optimal iteration complexity of O(\u03banylog\u2061(n/\u03f5)), where \u03ba is the condition number and ny is the dimension of y. This provably improves upon the naive classical bound and implies that we essentially train for free if we apply equal emphasis on training and tuning steps. Empirically, we observe that an unbiased gradient estimate achieves the best convergence results, supporting our theory. 
", + "title": "Joint Descent: Training and Tuning Simultaneously", + "authors": [ + "Qiuyi Zhang" + ], + "emails": [ + "~Qiuyi_Zhang1" + ], + "rank": 2212 + }, + { + "url": "https://openreview.net/forum?id=e-ZdxsIwweR", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 4 + ], + "rating": "4.57", + "confidences": [ + 4, + 4, + 4, + 2 + ], + "abstract": "Many real-world physical control systems are required to satisfy constraints upon deployment. Furthermore, real-world systems are often subject to effects such as non-stationarity, wear-and-tear, uncalibrated sensors and so on. Such effects effectively perturb the system dynamics and can cause a policy trained successfully in one domain to perform poorly when deployed to a perturbed version of the same domain. This can affect a policy's ability to maximize future rewards as well as the extent to which it satisfies constraints. We refer to this as constrained model misspecification. We present an algorithm with theoretical guarantees that mitigates this form of misspecification, and showcase its performance in multiple Mujoco tasks from the Real World Reinforcement Learning (RWRL) suite.", + "title": "Robust Constrained Reinforcement Learning for Continuous Control with Model Misspecification", + "authors": [ + "Daniel J Mankowitz", + "Dan Andrei Calian", + "Rae Jeong", + "Cosmin Paduraru", + "Nicolas Heess", + "Sumanth Dathathri", + "Martin Riedmiller", + "Timothy Mann" + ], + "emails": [ + "~Sumanth_Dathathri1", + "~Daniel_J_Mankowitz2", + "~Cosmin_Paduraru1", + "google.com", + "~Nicolas_Heess1", + "~Martin_Riedmiller1", + "~Timothy_Mann1", + "~Dan_Andrei_Calian1" + ], + "rank": 2213 + }, + { + "url": "https://openreview.net/forum?id=65_RUwah5kr", + "ratings": [ + 5, + 4, + 4, + 5, + 5 + ], + "rating": "4.57", + "confidences": [ + 4, + 5, + 4, + 4, + 4 + ], + "abstract": "While the celebrated graph neural networks yield effective representations for individual nodes of a graph, there has been relatively less 
success in extending to graph similarity learning. Recent works have considered either global-level graph-graph interactions or low-level node-node interactions, ignoring the rich cross-level interactions (e.g., between nodes of a graph and the other whole graph). In this paper, we propose a Multi-level Graph Matching Network (MGMN) for computing the graph similarity between any pair of graph-structured objects in an end-to-end fashion. The proposed MGMN model consists of a node-graph matching network for effectively learning cross-level interactions between nodes of a graph and the other whole graph, and a siamese graph neural network to learn global-level interactions between two graphs. Furthermore, to bridge the gap of the lack of standard graph similarity learning benchmark, we have created and collected a set of datasets for both graph-graph classification and regression tasks with different sizes in order to evaluate the robustness of models. Our comprehensive experiments demonstrate that MGMN consistently outperforms state-of-the-art baselines on these graph similarity learning benchmarks. ", + "title": "Multi-level Graph Matching Networks for Deep and Robust Graph Similarity Learning", + "authors": [ + "Xiang Ling", + "Lingfei Wu", + "Saizhuo Wang", + "Tengfei Ma", + "Fangli Xu", + "Alex X. Liu", + "Chunming Wu", + "Shouling Ji" + ], + "emails": [ + "~Tengfei_Ma1", + "~Saizhuo_Wang1", + "~Lingfei_Wu1", + "~Fangli_Xu2", + "~Shouling_Ji1", + "antfin.com", + "zju.edu.cn", + "~Xiang_Ling1" + ], + "rank": 2214 + }, + { + "url": "https://openreview.net/forum?id=6GkL6qM3LV", + "decision": "Reject", + "ratings": [ + 3, + 7, + 5, + 4 + ], + "rating": "4.57", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Binary decompilation is a powerful technique for analyzing and understanding software, when source code is unavailable. It is a critical problem in the computer security domain. 
With the success of neural machine translation (NMT), recent efforts on neural-based decompiler show promising results compared to traditional approaches. However, several key challenges remain: (i) Prior neural-based decompilers focus on simplified programs without considering sophisticated yet widely-used data types such as pointers; furthermore, many high-level expressions map to the same low-level code (expression collision), which incurs critical decompiling performance degradation; (ii) State-of-the-art NMT models(e.g., transformer and its variants) mainly deal with sequential data; this is inefficient for decompilation, where the input and output data are highly structured. In this paper, we propose N-Bref, a new framework for neural decompilers that addresses the two aforementioned challenges with two key design principles: (i)N-Bref designs a structural transformer with three key design components for better comprehension of structural data \u2013 an assembly encoder, an abstract syntax tree encoder, and a tree decoder, extending transformer models in the context of decompilation. (ii) N-Bref introduces a program generation tool that can control the complexity of code generation and removes expression collisions. Extensive experiments demonstrate that N-Bref outperforms previous neural-based decompilers by a margin of 6.1%/8.8% accuracy in datatype recovery and source code generation. 
In particular, N-Bref decompiled human-written Leetcode programs with complex library calls and data types in high accuracy.", + "title": "N-Bref : A High-fidelity Decompiler Exploiting Programming Structures ", + "authors": [ + "Cheng Fu", + "Kunlin Yang", + "Xinyun Chen", + "Yuandong Tian", + "Jishen Zhao" + ], + "emails": [ + "eng.ucsd.edu", + "~Cheng_Fu1", + "~Yuandong_Tian1", + "~Jishen_Zhao1", + "~Xinyun_Chen1" + ], + "rank": 2215 + }, + { + "url": "https://openreview.net/forum?id=ZHADKD4pl5H", + "decision": "Reject", + "ratings": [ + 4, + 3, + 5, + 7 + ], + "rating": "4.57", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Many real-world graphs are attributed graphs where nodes are associated with non-topological features. While attributes can be missing anywhere in an attributed graph, most of existing node representation learning approaches do not consider such incomplete information.\nIn this paper, we propose a general non-parametric framework to mitigate this problem. Starting from a decomposition of the attribute matrix, we transform node features into discrete distributions in a lower-dimensional space equipped with the Wasserstein metric. On this Wasserstein space, we propose Wasserstein graph diffusion to smooth the distributional representations of nodes with information from their local neighborhoods. This allows us to reduce the distortion caused by missing attributes and obtain integrated representations expressing information of both topology structures and attributes. We then pull the nodes back to the original space and produce corresponding point representations to facilitate various downstream tasks. 
To show the power of our representation method, we designed two algorithms based on it for node classification (with missing attributes) and matrix completion respectively, and demonstrate their effectiveness in experiments.", + "title": "Wasserstein diffusion on graphs with missing attributes", + "authors": [ + "Zhixian Chen", + "Tengfei Ma", + "Yangqiu Song", + "Yang Wang" + ], + "emails": [ + "ust.hk", + "~Tengfei_Ma1", + "~Zhixian_Chen1", + "~Yangqiu_Song1" + ], + "rank": 2216 + }, + { + "url": "https://openreview.net/forum?id=aYbCpFNnHdh", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 6 + ], + "rating": "4.57", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "Different types of \\emph{mental rotation tests} have been used extensively in psychology to understand human visual reasoning and perception. Understanding what an object or visual scene would look like from another viewpoint is a challenging problem that is made even harder if it must be performed from a single image. 3D computer vision has a long history of examining related problems. However, often what one is most interested in is the answer to a relatively simple question posed in another visual frame of reference -- as opposed to creating a full 3D reconstruction. \nMental rotations tests can also manifest as consequential questions in the real world such as: does the pedestrian that I see, see the car that I am driving?\nWe explore a controlled setting whereby questions are posed about the properties of a scene if the scene were observed from another viewpoint. To do this we have created a new version of the CLEVR VQA problem setup and dataset that we call CLEVR Mental Rotation Tests or CLEVR-MRT, where the goal is to answer questions about the original CLEVR viewpoint given a single image obtained from a different viewpoint of the same scene. 
Using CLEVR Mental Rotation Tests we examine standard state of the art methods, show how they fall short, then explore novel neural architectures that involve inferring representations encoded as feature volumes describing a scene. Our new methods use rigid transformations of feature volumes conditioned on the viewpoint camera. We examine the efficacy of different model variants through performing a rigorous ablation study. Furthermore, we examine the use of contrastive learning to infer a volumetric encoder in a self-supervised manner and find that this approach yields the best results of our study using CLEVR-MRT.", + "title": "Visual Question Answering From Another Perspective: CLEVR Mental Rotation Tests", + "authors": [ + "Christopher Beckham", + "Martin Weiss", + "Florian Golemo", + "Sina Honari", + "Derek Nowrouzezahrai", + "Christopher Pal" + ], + "emails": [ + "~Florian_Golemo1", + "~Derek_Nowrouzezahrai1", + "~Martin_Weiss4", + "~Christopher_Pal1", + "~Sina_Honari1", + "~Christopher_Beckham1" + ], + "rank": 2217 + }, + { + "url": "https://openreview.net/forum?id=HI0j7omXTaG", + "decision": "Reject", + "ratings": [ + 5, + 5, + 3, + 5 + ], + "rating": "4.57", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Training sparse networks to converge to the same performance as dense neural architectures has proven to be elusive. Recent work suggests that initialization is the key. However, while this direction of research has had some success, focusing on initialization alone appears to be inadequate. In this paper, we take a broader view of training sparse networks and consider various choices made during training that might disadvantage sparse networks. We measure the gradient flow across different networks and datasets, and show that the default choices of optimizers, activation functions and regularizers used for dense networks can disadvantage sparse networks. 
Based upon these findings, we show that gradient flow in sparse networks can be improved by reconsidering aspects of the architecture design and the training regime. Our work suggests that initialization is only one piece of the puzzle and a wider view of tailoring optimization to sparse networks yields promising results. ", + "title": "Keep the Gradients Flowing: Using Gradient Flow to study Sparse Network Optimization", + "authors": [ + "Kale-ab Tessera", + "Sara Hooker", + "Benjamin Rosman" + ], + "emails": [ + "~Benjamin_Rosman1", + "~Sara_Hooker1", + "~Kale-ab_Tessera1" + ], + "rank": 2218 + }, + { + "url": "https://openreview.net/forum?id=EwsLcX5NRKr", + "ratings": [ + 3, + 6, + 5, + 5 + ], + "rating": "4.57", + "confidences": [ + 4, + 2, + 5, + 3 + ], + "abstract": "Active learning aims to reduce the labeling costs by selecting only samples that are informative to improve the accuracy of the network. Few existing works have addressed the problem of active learning for object detection, and most of them estimate the informativeness of an image based only on the classification head, neglecting the influence of the localization head. In this paper, we propose a novel deep active learning approach for object detection. Our approach relies on mixture density networks to provide a mixture distribution for every output parameter. Based on these distributions, our approach is able to compute, separately and in a single forward pass of a single model, the epistemic and aleatoric uncertainty. We further propose a more efficient approach to reduce the computational cost of the mixture model. For active learning, we propose a scoring function that aggregates uncertainties from both the classification and the localization outputs of the network. 
Our extensive set of experiments on PASCAL VOC and COCO demonstrates that our modification to the object detection network yields better accuracy compared to the original one and, for active learning, our approach outperforms single-model based methods and performs on par when compared to methods using multiple models while requiring significantly lower computational cost. In addition, we show that our approach scales to different object detection networks and datasets acquired actively using our approach to transfer to different networks.", + "title": "Deep Active Learning for Object Detection with Mixture Density Networks", + "authors": [ + "Jiwoong Choi", + "Ismail Elezi", + "Hyuk-Jae Lee", + "Clement Farabet", + "Jose M. Alvarez" + ], + "emails": [ + "~Ismail_Elezi1", + "~Hyuk-Jae_Lee1", + "~Clement_Farabet1", + "~Jiwoong_Choi1", + "~Jose_M._Alvarez2" + ], + "rank": 2219 + }, + { + "url": "https://openreview.net/forum?id=TaUJl6Kt3rW", + "decision": "Reject", + "ratings": [ + 4, + 4, + 6 + ], + "rating": "4.57", + "confidences": [ + 5, + 5, + 4 + ], + "abstract": "In the age of digital recruitment, job posts can attract a large number of applications, and screening them manually can become a very tedious task. These recruitment records are stored in the form of tables in our recruitment database (Electronic Recruitment Records, referred to as ERRs). We have released a de-identified ERR dataset to the public domain. We also propose a BERT-based model, SkillBERT, the embeddings of which are used as features for classifying skills present in the ERRs into groups referred to as \"competency groups\". A competency group is a group of similar skills and it is used as matching criteria (instead of matching on skills) for finding the overlap of skills between the candidates and the jobs. This proxy match takes advantage of the BERT's capability of deriving meaning from the structure of competency groups present in the skill dataset. 
In our experiments, the SkillBERT, which is trained from scratch on the skills present in job requisitions, is shown to be better performing than the pre-trained BERT and the Word2Vec. We have also explored K-means clustering and spectral clustering on SkillBERT embeddings to generate cluster-based features. Both algorithms provide similar performance benefits. Last, we have experimented with different machine learning algorithms like Random Forest, XGBoost, and a deep learning algorithm Bi-LSTM . We did not observe a significant performance difference among the algorithms, although XGBoost and Bi-LSTM perform slightly better than Random Forest. The features created using SkillBERT are most predictive in the classification task, which demonstrates that the SkillBERT is able to capture information about the skills' ontology from the data. We have made the source code and the trained models of our experiments publicly available.", + "title": "SkillBERT: \u201cSkilling\u201d the BERT to classify skills!", + "authors": [ + "Amber Nigam", + "Shikha Tyagi", + "Kuldeep Tyagi", + "Arpan Saxena" + ], + "emails": [ + "hsph.harvard.edu", + "gmail.com", + "peoplestrong.com" + ], + "rank": 2220 + }, + { + "url": "https://openreview.net/forum?id=M_gk45ItxIp", + "ratings": [ + 4, + 5, + 4, + 5 + ], + "rating": "4.57", + "confidences": [ + 4, + 4, + 2, + 4 + ], + "abstract": "Recent progress in deep reinforcement learning (DRL) can be largely attributed to the use of neural networks. However, this black-box approach fails to explain the learned policy in a human understandable way. To address this challenge and improve the transparency, we introduce symbolic logic into DRL and propose a Neural Symbolic Reinforcement Learning framework, in which states and actions are represented in an interpretable way using first-order logic. 
This framework features a relational reasoning module, which performs on task-level in Hierarchical Reinforcement Learning, enabling end-to-end learning with prior symbolic knowledge. Moreover, interpretability is enabled by extracting the logical rules learned by the reasoning module in a symbolic rule space, providing explainability on task level. Experimental results demonstrate better interpretability of subtasks, along with competing performance compared with existing approaches.", + "title": "Interpretable Reinforcement Learning With Neural Symbolic Logic", + "authors": [ + "Zhihao Ma", + "Yuzheng Zhuang", + "Paul Weng", + "Dong Li", + "Kun Shao", + "Wulong Liu", + "Hankz Hankui Zhuo", + "Jianye HAO" + ], + "emails": [ + "~Hankz_Hankui_Zhuo2", + "~Zhihao_Ma1", + "~Wulong_Liu1", + "~Yuzheng_Zhuang1", + "~Paul_Weng1", + "~Jianye_HAO1", + "~Kun_Shao1", + "~Dong_Li10" + ], + "rank": 2221 + }, + { + "url": "https://openreview.net/forum?id=pdsec2YIOCx", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 5 + ], + "rating": "4.56", + "confidences": [ + 4, + 4, + 3, + 5 + ], + "abstract": "The core principle behind most collaborative filtering methods is to embed users and items in latent spaces, where individual dimensions are learned independently of any particular item attributes. It is thus difficult for users to control their recommendations based on particular aspects (critiquing). In this work, we propose Untangle: a recommendation model that gives users control over the recommendation list with respect to specific item attributes, (e.g.:less violent, funnier movies) that have a causal relationship in user preferences. Untangle uses a refined training procedure by training (i) a (partially) supervised \u03b2-VAE that disentangles the item representations and (ii) a second phase which optimized to generate recommendations for users. 
Untangle gives control on critiquing recommendations based on users preferences, without sacrificing on recommendation accuracy. Moreover only a tiny fraction of labeled items is needed to create disentangled preference representations over attributes. \n\n", + "title": "Untangle: Critiquing Disentangled Recommendations", + "authors": [ + "Preksha Nema", + "Alexandros Karatzoglou", + "Filip Radlinski" + ], + "emails": [ + "~Alexandros_Karatzoglou1", + "~Filip_Radlinski1", + "~Preksha_Nema1" + ], + "rank": 2222 + }, + { + "url": "https://openreview.net/forum?id=4CqesJ7GO7Q", + "decision": "Reject", + "ratings": [ + 6, + 4, + 4, + 4 + ], + "rating": "4.56", + "confidences": [ + 5, + 4, + 4, + 5 + ], + "abstract": "Adversarial training is one of the most effective approaches to improve model robustness against adversarial examples. However, previous works mainly focus on the overall robustness of the model, and the in-depth analysis on the role of each class involved in adversarial training is still missing. In this paper, we provide the first detailed class-wise diagnosis of adversarial training on six widely used datasets, i.e., MNIST, CIFAR-10, CIFAR-100, SVHN, STL-10 and ImageNet. Surprisingly, we find that there are remarkable robustness discrepancies among classes, demonstrating the following intriguing properties: 1) Many examples from a certain class could only be maliciously attacked to some specific semantic-similar classes, and these examples will not exist adversarial counterparts in bounded \u03f5-ball if we re-train the model without those specific classes; 2) The robustness of each class is positively correlated with its norm of classifier weight in deep neural networks; 3) Stronger attacks are usually more powerful for vulnerable classes. Finally, we propose an attack to better understand the defense mechanism of some state-of-the-art models from the class-wise perspective. 
We believe these findings can contribute to a more comprehensive understanding of adversarial training as well as further improvement of adversarial robustness.", + "title": "Intriguing class-wise properties of adversarial training", + "authors": [ + "Qi Tian", + "Kun Kuang", + "Fei Wu", + "Yisen Wang" + ], + "emails": [ + "~Kun_Kuang1", + "~Qi_Tian6", + "~Yisen_Wang1", + "~Fei_Wu1" + ], + "rank": 2223 + }, + { + "url": "https://openreview.net/forum?id=8IX8Qum6jGR", + "ratings": [ + 5, + 4, + 5 + ], + "rating": "4.56", + "confidences": [ + 3, + 4, + 2 + ], + "abstract": " Expressivity plays a fundamental role in evaluating deep neural networks, and it is closely related to understanding the limit of performance improvement. In this paper, we propose a three-pipeline training framework based on critical expressivity, including global model contraction, weight evolution, and link's weight rewiring. Specifically, we propose a pyramidal-like skeleton to overcome the saddle points that affect information transfer. Then we analyze the reason for the modularity (clustering) phenomenon in network topology and use it to rewire potential erroneous weighted links. We conduct numerical experiments on node classification and the results confirm that the proposed training framework leads to a significantly improved performance in terms of fast convergence and robustness to potential erroneous weighted links. The architecture design on GNNs, in turn, verifies the expressivity of GNNs from dynamics and topological space aspects and provides useful guidelines in designing more efficient neural networks. 
The code is available at https://github.com/xjglgjgl/SRGNN.", + "title": "A Deep Graph Neural Networks Architecture Design: From Global Pyramid-like Shrinkage Skeleton to Local Link Rewiring", + "authors": [ + "Gege Zhang", + "Gangwei Li", + "Weining Shen", + "Huixin Zhang", + "Weidong Zhang" + ], + "emails": [ + "~Gege_Zhang2", + "nvidia.com", + "~Weining_Shen1", + "sjtu.edu.cn" + ], + "rank": 2224 + }, + { + "url": "https://openreview.net/forum?id=wXBt-7VM2JE", + "decision": "Reject", + "ratings": [ + 3, + 8, + 5 + ], + "rating": "4.56", + "confidences": [ + 5, + 2, + 2 + ], + "abstract": "Extrapolation in graph classification/regression remains an underexplored area of an otherwise rapidly developing field. Our work contributes to a growing literature by providing the first systematic counterfactual modeling framework for extrapolations in graph classification/regression tasks. To show that extrapolation from a single training environment is possible, we develop a connection between certain extrapolation tasks on graph sizes and Lovasz's characterization of graph limits. For these extrapolations, standard graph neural networks (GNNs) will fail, while classifiers using induced homomorphism densities succeed, but mostly on unattributed graphs. Generalizing these density features through a GNN subgraph decomposition allows them to also succeed in more complex attributed graph extrapolation tasks. 
Finally, our experiments validate our theoretical results and showcase some shortcomings of common (interpolation) methods in the literature.", + "title": "On Single-environment Extrapolations in Graph Classification and Regression Tasks", + "authors": [ + "Beatrice Bevilacqua", + "Yangze Zhou", + "Ryan L Murphy", + "Bruno Ribeiro" + ], + "emails": [ + "~Yangze_Zhou1", + "~Bruno_Ribeiro1", + "~Ryan_L_Murphy1", + "~Beatrice_Bevilacqua1" + ], + "rank": 2225 + }, + { + "url": "https://openreview.net/forum?id=5USOVm2HkfG", + "decision": "Reject", + "ratings": [ + 6, + 3, + 4, + 5, + 5 + ], + "rating": "4.56", + "confidences": [ + 2, + 3, + 4, + 5, + 4 + ], + "abstract": "While reinforcement learning has achieved considerable successes in recent years, state-of-the-art models are often still limited by the size of state and action spaces. Model-free reinforcement learning approaches use some form of state representations and the latest work has explored embedding techniques for actions, both with the aim of achieving better generalization and applicability. However, these approaches consider only states or actions, ignoring the interaction between them when generating embedded representations. In this work, we propose a new approach for jointly learning embeddings for states and actions that combines aspects of model-free and model-based reinforcement learning, which can be applied in both discrete and continuous domains. Specifically, we use a model of the environment to obtain embeddings for states and actions and present a generic architecture that uses these to learn a policy. In this way, the embedded representations obtained via our approach enable better generalization over both states and actions by capturing similarities in the embedding spaces. 
Evaluations of our approach on several gaming, robotic control, and recommender systems show it significantly outperforms state-of-the-art models in both discrete/continuous domains with large state/action spaces, thus confirming its efficacy and the overall superior performance.", + "title": "Jointly-Trained State-Action Embedding for Efficient Reinforcement Learning", + "authors": [ + "Paul Julian Pritz", + "Liang Ma", + "Kin Leung" + ], + "emails": [ + "~Kin_Leung1", + "~Liang_Ma4", + "~Paul_Julian_Pritz1" + ], + "rank": 2226 + }, + { + "url": "https://openreview.net/forum?id=L3iGqaCTWS9", + "decision": "Reject", + "ratings": [ + 4, + 4, + 6, + 4 + ], + "rating": "4.56", + "confidences": [ + 4, + 4, + 5, + 5 + ], + "abstract": "Existing post-training quantization methods attempt to compensate for the quantization loss by determining the quantized weights and activation ranges with the help of training data. Quantization aware training methods, on the other hand, achieve accuracy near to FP32 models by training the quantized model which consume more time. Both these methods are not effective for privacy constraint applications as they are tightly coupled with training data. In contrast, this paper proposes a data-independent post-training quantization scheme that eliminates the need for training data. This is achieved by generating a faux dataset hereafter called as \u2018Retro-Synthesis Data\u2019 from the FP32 model layer statistics and further using it for quantization. This approach outperformed state-of-the-art methods including, but not limited to, ZeroQ and DFQ on models with and without batch-normalization layers for 8, 6 and 4 bit precisions. We also introduced two futuristic variants of post-training quantization methods namely \u2018Hybrid-Quantization\u2019 and \u2018Non-Uniform Quantization\u2019. 
The Hybrid-Quantization scheme determines the sensitivity of each layer for per-tensor and per-channel quantization, and thereby generates hybrid quantized models that are 10\u221220% efficient in inference time while achieving same or better accuracy as compared to per-channel quantization. Also this method outperformed FP32 accuracy when applied for models such as ResNet-18, and ResNet-50 onImageNet dataset. In the proposed Non-Uniform quantization scheme, the weights are grouped into different clusters and these clusters are assigned with a varied number of quantization steps depending on the number of weights and their ranges in respective cluster. This method resulted in an accuracy improvement of 1% against state-of-the-art quantization methods on ImageNet dataset.", + "title": "Hybrid and Non-Uniform DNN quantization methods using Retro Synthesis data for efficient inference", + "authors": [ + "TEJPRATAP GVSL", + "Raja Kumar", + "Pradeep NS" + ], + "emails": [ + "samsung.com", + "~Raja_Kumar2", + "~TEJPRATAP_GVSL1" + ], + "rank": 2227 + }, + { + "url": "https://openreview.net/forum?id=xtKFuhfK1tK", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4, + 4 + ], + "rating": "4.56", + "confidences": [ + 3, + 3, + 5, + 5 + ], + "abstract": "Training Graph Convolutional Networks (GCNs) is expensive as it needs to aggregate data recursively from neighboring nodes. To reduce the computation overhead, previous works have proposed various neighbor sampling methods that estimate the aggregation result based on a small number of sampled neighbors. Although these methods have successfully accelerated the training, they mainly focus on the single-machine setting. As real-world graphs are large, training GCNs in distributed systems is desirable. However, we found that the existing neighbor sampling methods do not work well in a distributed setting. Specifically, a naive implementation may incur a huge amount of communication of feature vectors among different machines. 
To address this problem, we propose a communication-efficient neighbor sampling method in this work. Our main idea is to assign higher sampling probabilities to the local nodes so that remote nodes are accessed less frequently. We present an algorithm that determines the local sampling probabilities and makes sure our skewed neighbor sampling does not affect much to the convergence of the training. Our experiments with node classification benchmarks show that our method significantly reduces the communication overhead for distributed GCN training with little accuracy loss. \n", + "title": "Communication-Efficient Sampling for Distributed Training of Graph Convolutional Networks", + "authors": [ + "Peng Jiang", + "Masuma Akter Rumi" + ], + "emails": [ + "uiowa.edu", + "~Peng_Jiang4" + ], + "rank": 2228 + }, + { + "url": "https://openreview.net/forum?id=P__qBPffIlK", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5 + ], + "rating": "4.56", + "confidences": [ + 2, + 4, + 3 + ], + "abstract": "Data privacy is an increasingly important aspect of many real-world big data analytics tasks. Data sources that contain sensitive information may have immense potential which could be unlocked using privacy enhancing transformations, but current methods often fail to produce convincing output. Furthermore, finding the right balance between privacy and utility is often a tricky trade-off. In this work, we propose a novel approach for data privatization, which involves two steps: in the first step, it removes the sensitive information, and in the second step, it replaces this information with an independent random sample. Our method builds on adversarial representation learning which ensures strong privacy by training the model to fool an increasingly strong adversary. 
While previous methods only aim at obfuscating the sensitive information, we find that adding new random information in its place strengthens the provided privacy and provides better utility at any given level of privacy. The result is an approach that can provide stronger privatization on image data, and yet be preserving both the domain and the utility of the inputs, entirely independent of the downstream task.", + "title": "Adversarial representation learning for synthetic replacement of private attributes", + "authors": [ + "John Martinsson", + "Edvin Listo Zec", + "Daniel Gillblad", + "Olof Mogren" + ], + "emails": [ + "~Daniel_Gillblad1", + "~Olof_Mogren1", + "~John_Martinsson1", + "~Edvin_Listo_Zec1" + ], + "rank": 2229 + }, + { + "url": "https://openreview.net/forum?id=MWj_P-Lk3jC", + "ratings": [ + 7, + 5, + 3, + 3 + ], + "rating": "4.56", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "\"``Nonstationarity\" is a fundamental problem in cooperative multi-agent reinforcement learning (MARL). It results from every agent's policy changing during learning, while being part of the environment from the perspective of other agents. This causes information to inherently oscillate between agents during learning, greatly slowing convergence. We use the MAILP model of information transfer during multi-agent learning to show that increasing centralization during learning arbitrarily mitigates the slowing of convergence due to nonstationarity. The most centralized case of learning is parameter sharing, an uncommonly used MARL method, specific to environments with homogeneous agents. It bootstraps single-agent reinforcement learning (RL) methods and learns an identical policy for each agent. We experimentally replicate our theoretical result of increased learning centralization leading to better performance. 
We further apply parameter sharing to 8 more modern single-agent deep RL methods for the first time, achieving up to 44 times more average reward in 16% as many episodes compared to previous parameter sharing experiments. We finally give a formal proof of a set of methods that allow parameter sharing to serve in environments with heterogeneous agents.", + "title": "Revisiting Parameter Sharing in Multi-Agent Deep Reinforcement Learning", + "authors": [ + "Justin K Terry", + "Nathaniel Grammel", + "Ananth Hari", + "Luis Santos", + "Benjamin Black" + ], + "emails": [ + "umd.edu", + "swarmlabs.com", + "~Justin_K_Terry1" + ], + "rank": 2230 + }, + { + "url": "https://openreview.net/forum?id=EZ8aZaCt9k", + "decision": "Reject", + "ratings": [ + 6, + 4, + 4, + 5, + 4 + ], + "rating": "4.56", + "confidences": [ + 3, + 4, + 3, + 4, + 4 + ], + "abstract": "Empirical studies suggest that wide neural networks are comparably easy to optimize, but mathematical support for this observation is scarce. In this paper, we analyze the optimization landscapes of deep learning with wide networks. We prove especially that constraint and unconstraint empirical-risk minimization over such networks has no spurious local minima. 
Hence, our theories substantiate the common belief that increasing network widths not only improves the expressiveness of deep-learning pipelines but also facilitates their optimizations.", + "title": "No Spurious Local Minima: on the Optimization Landscapes of Wide and Deep Neural Networks", + "authors": [ + "Johannes Lederer" + ], + "emails": [ + "~Johannes_Lederer1" + ], + "rank": 2231 + }, + { + "url": "https://openreview.net/forum?id=INXUNEmgbnx", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 4 + ], + "rating": "4.56", + "confidences": [ + 5, + 4, + 4, + 3 + ], + "abstract": "We introduce a parameterization method called Neural Bayes which allows computing statistical quantities that are in general difficult to compute and opens avenues for formulating new objectives for unsupervised representation learning. Specifically, given an observed random variable x and a latent discrete variable z, we can express p(x|z), p(z|x) and p(z) in closed form in terms of a sufficiently expressive function (Eg. neural network) using our parameterization without restricting the class of these distributions. To demonstrate its usefulness, we develop two independent use cases for this parameterization: \n\n1. Disjoint Manifold Separation: Neural Bayes allows us to formulate an objective which can optimally label samples from disjoint manifolds present in the support of a continuous distribution. This can be seen as a specific form of clustering where each disjoint manifold in the support is a separate cluster. We design clustering tasks that obey this formulation and empirically show that the model optimally labels the disjoint manifolds.\n\n2. Mutual Information Maximization (MIM): MIM has become a popular means for self-supervised representation learning. Neural Bayes allows us to compute mutual information between observed random variables x and latent discrete random variables z in closed form. 
We use this for learning image representations and show its usefulness on downstream classification tasks. ", + "title": "Neural Bayes: A Generic Parameterization Method for Unsupervised Learning", + "authors": [ + "Devansh Arpit", + "Huan Wang", + "Caiming Xiong", + "richard socher", + "Yoshua Bengio" + ], + "emails": [ + "~Devansh_Arpit2", + "~Huan_Wang1", + "~Yoshua_Bengio1", + "~richard_socher1", + "~Caiming_Xiong1" + ], + "rank": 2232 + }, + { + "url": "https://openreview.net/forum?id=f6gtnqp4u6K", + "ratings": [ + 5, + 5, + 5, + 3 + ], + "rating": "4.56", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "Due to the catastrophic forgetting phenomenon of deep neural networks (DNNs), models trained in standard ways tend to forget what it has learned from previous tasks, especially when the new task is sufficiently different from the previous ones. To overcome this issue, various continual learning techniques have been developed in recent years, which, however, often suffer from a substantially increased model complexity and training time. In this paper, we consider whether properly tailored simple models could perform well for continual learning. By proposing a relatively simple method based on Bayesian neural networks and model selection, we can in many cases outperform several state-of-the-art techniques in terms of accuracy, model size, and running time, especially when each mini-batch of data is known to come from the same task of an unknown identity. 
This leads to interesting observations suggesting that different continual learning techniques may be beneficial for different types of data and task diversity.\n", + "title": "Continual Learning Without Knowing Task Identities: Do Simple Models Work?", + "authors": [ + "Tiffany Tuor", + "Shiqiang Wang", + "Kin Leung" + ], + "emails": [ + "imperial.ac.uk", + "~Shiqiang_Wang1", + "~Kin_Leung1" + ], + "rank": 2233 + }, + { + "url": "https://openreview.net/forum?id=ZwZ3sc0qad", + "decision": "Reject", + "ratings": [ + 4, + 7, + 4, + 4 + ], + "rating": "4.56", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": " We study the properties of alignment, a form of implicit regularization, in linear neural networks under gradient descent. We define alignment for fully connected networks with multidimensional outputs and show that it is a natural extension of alignment in networks with 1-dimensional outputs as defined by Ji and Telgarsky, 2018. While in fully connected networks, there always exists a global minimum corresponding to an aligned solution, we analyze alignment as it relates to the training process. Namely, we characterize when alignment is an invariant of training under gradient descent by providing necessary and sufficient conditions for this invariant to hold. In such settings, the dynamics of gradient descent simplify, thereby allowing us to provide an explicit learning rate under which the network converges linearly to a global minimum. We then analyze networks with layer constraints such as convolutional networks. In this setting, we prove that gradient descent is equivalent to projected gradient descent, and that alignment is impossible with sufficiently large datasets. 
\n", + "title": "On Alignment in Deep Linear Neural Networks", + "authors": [ + "Adityanarayanan Radhakrishnan", + "Eshaan Nichani", + "Daniel Bernstein", + "Caroline Uhler" + ], + "emails": [ + "~Adityanarayanan_Radhakrishnan1", + "~Caroline_Uhler1", + "~Eshaan_Nichani1", + "mit.edu" + ], + "rank": 2234 + }, + { + "url": "https://openreview.net/forum?id=3R--2TdxMps", + "decision": "Reject", + "ratings": [ + 4, + 6, + 4 + ], + "rating": "4.55", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "With the greater proliferation of machine learning models, the imperative of diagnosing and correcting bugs in models has become increasingly clear. As a route to better discover and fix model bugs, we propose failure scenarios: regions on the data manifold that are incorrectly classified by a model. We propose an end-to-end debugging framework called Defuse to use these regions for fixing faulty classifier predictions. The Defuse framework works in three steps. First, Defuse identifies many unrestricted adversarial examples--naturally occurring instances that are misclassified--using a generative model. Next, the procedure distills the misclassified data using clustering into failure scenarios. Last, the method corrects model behavior on the distilled scenarios through an optimization based approach. We illustrate the utility of our framework on a variety of image data sets. 
We find that Defuse identifies and resolves concerning predictions while maintaining model generalization.", + "title": "Defuse: Debugging Classifiers Through Distilling Unrestricted Adversarial Examples", + "authors": [ + "Dylan Z Slack", + "Nathalie Rauschmayr", + "Krishnaram Kenthapadi" + ], + "emails": [ + "amazon.com", + "~Dylan_Z_Slack1", + "~Krishnaram_Kenthapadi1" + ], + "rank": 2235 + }, + { + "url": "https://openreview.net/forum?id=uC7QRdX5OjI", + "ratings": [ + 4, + 4, + 4, + 7 + ], + "rating": "4.55", + "confidences": [ + 3, + 3, + 3, + 2 + ], + "abstract": "We seek to create agents that both act and communicate with other agents in pursuit of a goal. Towards this end, we extend LIGHT (Urbanek et al. 2019)---a large-scale crowd-sourced fantasy text-game---with a dataset of quests. These contain natural language motivations paired with in-game goals and human demonstrations; completing a quest might require dialogue or actions (or both). We introduce a reinforcement learning system that (1) incorporates large-scale language modeling-based and commonsense reasoning-based pre-training to imbue the agent with relevant priors; and (2) leverages a factorized action space of action commands and dialogue, balancing between the two. 
We conduct zero-shot evaluations using held-out human expert demonstrations, showing that our agents are able to act consistently and talk naturally with respect to their motivations.", + "title": "How to Motivate Your Dragon: Teaching Goal-Driven Agents to Speak and Act in Fantasy Worlds", + "authors": [ + "Prithviraj Ammanabrolu", + "Jack Urbanek", + "Margaret Li", + "Arthur Szlam", + "Tim Rockt\u00e4schel", + "Jason E Weston" + ], + "emails": [ + "~Margaret_Li1", + "~Jack_Urbanek1", + "~Tim_Rockt\u00e4schel1", + "~Arthur_Szlam1", + "~Prithviraj_Ammanabrolu1", + "~Jason_E_Weston1" + ], + "rank": 2236 + }, + { + "url": "https://openreview.net/forum?id=DQpwoZgqyZ", + "decision": "Reject", + "ratings": [ + 4, + 4, + 6, + 4 + ], + "rating": "4.55", + "confidences": [ + 3, + 2, + 3, + 3 + ], + "abstract": "Information-theoretic perspectives can provide an alternative dimension of analyzing the learning process and complements usual performance metrics. Recently several works proposed methods for quantifying information content in a model (which we refer to as \"model information\"). We demonstrate using model information as a general analysis tool to gain insight into problems that arise in deep learning. By utilizing model information in different scenarios with different control variables, we are able to adapt model information to analyze fundamental elements of learning, i.e., task, data, model, and algorithm. We provide an example in each domain that model information is used as a tool to provide new solutions to problems or to gain insight into the nature of the particular learning setting. 
These examples help to illustrate the versatility and potential utility of model information as an analysis tool in deep learning.", + "title": "Model information as an analysis tool in deep learning", + "authors": [ + "Xiao Zhang", + "Di Hu", + "Xingjian Li", + "Dejing Dou", + "Ji Wu" + ], + "emails": [ + "baidu.com", + "~Xiao_Zhang9", + "~Dejing_Dou1", + "mail.tsinghua.edu.cn", + "~Di_Hu1" + ], + "rank": 2237 + }, + { + "url": "https://openreview.net/forum?id=uvEgLKYMBF9", + "decision": "Reject", + "ratings": [ + 6, + 4, + 4 + ], + "rating": "4.55", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "Variational autoencoders with deep hierarchies of stochastic layers have been known to suffer from the problem of posterior collapse, where the top layers fall back to the prior and become independent of input.\nWe suggest that the hierarchical VAE objective explicitly includes the variance of the function parameterizing the mean and variance of the latent Gaussian distribution which itself is often a high variance function.\nBuilding on this we generalize VAE neural networks by incorporating a smoothing parameter motivated by Gaussian analysis to reduce variance in parameterizing functions and show that this can help to solve the problem of posterior collapse.\nWe further show that under such smoothing the VAE loss exhibits a phase transition, where the top layer KL divergence sharply drops to zero at a critical value of the smoothing parameter.\nWe validate the phenomenon across model configurations and datasets.", + "title": "Variance Reduction in Hierarchical Variational Autoencoders", + "authors": [ + "Adeel Pervez", + "Efstratios Gavves" + ], + "emails": [ + "~Adeel_Pervez1", + "~Efstratios_Gavves1" + ], + "rank": 2238 + }, + { + "url": "https://openreview.net/forum?id=0gfSzsRDZFw", + "decision": "Reject", + "ratings": [ + 6, + 4, + 4 + ], + "rating": "4.55", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "We consider the saliency problem for black-box 
classification. In image classification, this means highlighting the part of the image that is most relevant for the current decision.\nWe cast the saliency problem as finding an optimal ablation path between two images. An ablation path consists of a sequence of ever smaller masks, joining the current image to a reference image in another decision region. The optimal path will stay as long as possible in the current decision region. This approach extends the ablation tests in [Sturmfels et al. (2020)]. The gradient of the corresponding objective function is closely related to the integrated gradient method [Sundararajan et al. (2017)]. In the saturated case (when the classifier outputs a binary value) our method would reduce to the meaningful perturbation approach [Fong & Vedaldi (2017)], since crossing the decision boundary as late as\npossible would then be equivalent to finding the smallest possible mask lying on\nthe decision boundary.\nOur interpretation provides geometric understanding of existing saliency methods, and suggests a novel approach based on ablation path optimisation.", + "title": "Ablation Path Saliency", + "authors": [ + "Olivier Verdier", + "Justus Sagem\u00fcller" + ], + "emails": [ + "~Olivier_Verdier1", + "~Justus_Sagem\u00fcller1" + ], + "rank": 2239 + }, + { + "url": "https://openreview.net/forum?id=WN_6sThEI_-", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 5 + ], + "rating": "4.54", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Recurrent neural networks have proven effective in modeling sequential user feedbacks for recommender systems. However, they usually focus solely on item relevance and fail to effectively explore diverse items for users, therefore harming the system performance in the long run. To address this problem, we propose a new type of recurrent neural networks, dubbed recurrent exploration networks (REN), to jointly perform representation learning and effective exploration in the latent space. 
REN tries to balance relevance and exploration while taking into account the uncertainty in the representations. Our theoretical analysis shows that REN can preserve the rate-optimal sublinear regret (Chu et al., 2011) even when there exists uncertainty in the learned representations. Our empirical study demonstrates that REN can achieve satisfactory long-term rewards on both synthetic and real-world recommendation datasets, outperforming state-of-the-art models. ", + "title": "Recurrent Exploration Networks for Recommender Systems", + "authors": [ + "Hao Wang", + "Yifei Ma", + "Hao Ding", + "Bernie Wang" + ], + "emails": [ + "~Yifei_Ma1", + "~Bernie_Wang1", + "~Hao_Wang3", + "amazon.com" + ], + "rank": 2240 + }, + { + "url": "https://openreview.net/forum?id=0vO-u0sucRF", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5, + 5 + ], + "rating": "4.54", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "We formulate meta learning using information theoretic concepts such as mutual information and the information bottleneck. The idea is to learn a stochastic representation or encoding of the task description, given by a training or support set, that is highly informative about predicting the validation set. By making use of variational approximations to the mutual information, we derive a general and tractable framework for meta learning. We particularly develop new memory-based meta learning algorithms based on Gaussian processes and derive extensions that combine memory and gradient-based meta learning. 
We demonstrate our method on few-shot regression and classification by using standard benchmarks such as Omniglot, mini-Imagenet and Augmented Omniglot.\n", + "title": "Information Theoretic Meta Learning with Gaussian Processes", + "authors": [ + "Michalis Titsias", + "Sotirios Nikoloutsopoulos", + "Alexandre Galashov" + ], + "emails": [ + "~Alexandre_Galashov1", + "~Michalis_Titsias1", + "aueb.gr" + ], + "rank": 2241 + }, + { + "url": "https://openreview.net/forum?id=nMefdZyJ7ie", + "ratings": [ + 4, + 4, + 5, + 6 + ], + "rating": "4.54", + "confidences": [ + 5, + 3, + 3, + 2 + ], + "abstract": "We present Neural Disjunctive Normal Form (Neural DNF), a hybrid neuro- symbolic classifier that vertically integrates propositional logic with a deep neural network. Here, we aim at a vertical integration of logic and deep learning: we utilize the ability of deep neural networks as feature extractors to extract intermediate representation from data, and then a Disjunctive Normal Form (DNF) module to perform logical rule-based classification; we also seek this integration to be tight that these two normally-incompatible modules can be learned in an end-to-end manner, for which we propose the BOAT algorithm.\nCompared with standard deep classifiers which use a linear model or variants of additive model as the classification head, Neural DNF provides a new choice of model based on logic rules. It offers interpretability via an explicit symbolic representation, strong model expressity, and a different type of model inductive bias. 
Neural DNF is particularly suited for certain tasks that require some logical composition and provides extra interpretability.", + "title": "Neural Disjunctive Normal Form: Vertically Integrating Logic With Deep Learning For Classification", + "authors": [ + "Jialin Lu", + "Martin Ester" + ], + "emails": [ + "~Jialin_Lu1", + "~Martin_Ester1" + ], + "rank": 2242 + }, + { + "url": "https://openreview.net/forum?id=a-_HfiIow3m", + "decision": "Reject", + "ratings": [ + 6, + 4, + 4, + 5 + ], + "rating": "4.54", + "confidences": [ + 2, + 4, + 4, + 3 + ], + "abstract": "Multimodal generative models should be able to learn a meaningful latent representation that enables a coherent joint generation of all modalities (e.g., images and text). Many applications also require the ability to accurately sample modalities conditioned on observations of a subset of the modalities. Often not all modalities may be observed for all training data points, so semi-supervised learning should be possible.\nIn this study, we evaluate a family of product-of-experts (PoE) based variational autoencoders that have these desired properties. We include a novel PoE based architecture and training procedure. An empirical evaluation shows that the PoE based models can outperform an additive mixture-of-experts (MoE) approach.\nOur experiments support the intuition that PoE models are more suited for a conjunctive combination of modalities while MoEs are more suited for a disjunctive fusion. 
", + "title": "Multimodal Variational Autoencoders for Semi-Supervised Learning: In Defense of Product-of-Experts", + "authors": [ + "Svetlana Kutuzova", + "Oswin Krause", + "Douglas McCloskey", + "Mads Nielsen", + "Christian Igel" + ], + "emails": [ + "~Oswin_Krause1", + "~Christian_Igel1", + "~Mads_Nielsen2", + "biosustain.dtu.dk" + ], + "rank": 2243 + }, + { + "url": "https://openreview.net/forum?id=AJTAcS7SZzf", + "decision": "Reject", + "ratings": [ + 5, + 6, + 3 + ], + "rating": "4.54", + "confidences": [ + 4, + 4, + 5 + ], + "abstract": "Data sampling acts as a pivotal role in training deep learning models. However, an effective sampling schedule is difficult to learn due to its inherent high-dimension as a hyper-parameter. In this paper, we propose the AutoSampling method to automatically learn sampling schedules for model training, which consists of the multi-exploitation step aiming for optimal local sampling schedules and the exploration step for the ideal sampling distribution. More specifically, we achieve sampling schedule search with shortened exploitation cycle to provide enough supervision. In addition, we periodically estimate the sampling distribution from the learned sampling schedules and perturb it to search in the distribution space. The combination of two searches allows us to learn a robust sampling schedule. 
We apply our AutoSampling method to a variety of image classification tasks illustrating the effectiveness of the proposed method.", + "title": "AUTOSAMPLING: SEARCH FOR EFFECTIVE DATA SAMPLING SCHEDULES", + "authors": [ + "Ming Sun", + "Haoxuan Dou", + "Baopu Li", + "Junjie Yan", + "Wanli Ouyang" + ], + "emails": [ + "~Wanli_Ouyang1", + "~Junjie_Yan4", + "~Ming_Sun4", + "~Baopu_Li1", + "~Haoxuan_Dou1" + ], + "rank": 2244 + }, + { + "url": "https://openreview.net/forum?id=MJmYbFnJAGa", + "decision": "Reject", + "ratings": [ + 4, + 6, + 5, + 4 + ], + "rating": "4.54", + "confidences": [ + 4, + 2, + 3, + 4 + ], + "abstract": "Federated learning is a challenging optimization problem due to the heterogeneity of the data across different clients. Such heterogeneity has been observed to induce \\emph{client drift} and significantly degrade the performance of algorithms designed for this setting.\nIn contrast, centralized learning with centrally collected data does not experience such drift, and has seen great empirical and theoretical progress with innovations such as momentum, adaptivity, etc.\nIn this work, we propose a general framework {\\sc Mime} which mitigates client-drift and adapts arbitrary centralized optimization algorithms (e.g. SGD, Adam, etc.) to federated learning.\n{\\sc Mime} uses a combination of \\emph{control-variates} and \\emph{server-level statistics} (e.g. momentum) at every client-update step to ensure that each local update mimics that of the centralized method. Our thorough theoretical and empirical analyses strongly establish \\mime's superiority over other baselines.", + "title": "Mime: Mimicking Centralized Stochastic Algorithms in Federated Learning", + "authors": [ + "Sai Praneeth Karimireddy", + "Martin Jaggi", + "Satyen Kale", + "Mehryar Mohri", + "Sashank J. 
Reddi", + "Sebastian U Stich", + "Ananda Theertha Suresh" + ], + "emails": [ + "~Sebastian_U_Stich1", + "~Martin_Jaggi1", + "~Ananda_Theertha_Suresh1", + "~Sai_Praneeth_Karimireddy1", + "~Mehryar_Mohri1", + "~Sashank_J._Reddi1", + "~Satyen_Kale2" + ], + "rank": 2245 + }, + { + "url": "https://openreview.net/forum?id=QZaeLBDU03", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 4 + ], + "rating": "4.54", + "confidences": [ + 4, + 3, + 4, + 2 + ], + "abstract": "The field of cybersecurity has mostly been a cat-and-mouse game with the discovery of new attacks leading the way. To take away an attacker's advantage of reconnaissance, researchers have proposed proactive defense methods such as Moving Target Defense (MTD). To find good movement strategies, researchers have modeled MTD as leader-follower games between the defender and a cyber-adversary. We argue that existing models are inadequate in sequential settings when there is incomplete information about rational adversary and yield sub-optimal movement strategies. Further, while there exists an array of work on learning defense policies in sequential settings for cyber-security, they are either unpopular due to scalability issues arising out of incomplete information or tend to ignore the strategic nature of the adversary simplifying the scenario to use single-agent reinforcement learning techniques. To address these concerns, we propose (1) a unifying game-theoretic model, called the Bayesian Stackelberg Markov Games (BSMGs), that can model uncertainty over attacker types and the nuances of an MTD system and (2) a Bayesian Strong Stackelberg Q-learning (BSS-Q) approach that can, via interaction, learn the optimal movement policy for BSMGs within a reasonable time. We situate BSMGs in the landscape of incomplete-information Markov games and characterize the notion of Strong Stackelberg Equilibrium (SSE) in them. 
We show that our learning approach converges to an SSE of a BSMG and then highlight that the learned movement policy (1) improves the state-of-the-art in MTD for web-application security and (2) converges to an optimal policy in MTD domains with incomplete information about adversaries even when prior information about rewards and transitions is absent.", + "title": "Learning Movement Strategies for Moving Target Defense", + "authors": [ + "Sailik Sengupta", + "Subbarao Kambhampati" + ], + "emails": [ + "~Sailik_Sengupta1", + "~Subbarao_Kambhampati1" + ], + "rank": 2246 + }, + { + "url": "https://openreview.net/forum?id=o21sjfFaU1", + "decision": "Reject", + "ratings": [ + 4, + 6, + 5, + 3 + ], + "rating": "4.54", + "confidences": [ + 2, + 4, + 3, + 4 + ], + "abstract": "Machine learning has demonstrated remarkable prediction accuracy over i.i.d data, but the accuracy often drops when tested with data from another distribution. One reason behind this accuracy drop is the reliance of models on the features that are only associated with the label in the training distribution, but not the test distribution. This problem is usually known as spurious correlation, confounding factors, or dataset bias. In this paper, we formally study the generalization error bound for this setup with the knowledge of how the spurious features are associated with the label. We also compare our analysis to the widely-accepted domain adaptation error bound and show that our bound can be tighter, with more assumptions that we consider realistic. Further, our analysis naturally offers a set of solutions for this problem, linked to established solutions in various topics about robustness in general, and these solutions all require some understandings of how the spurious features are associated with the label. 
Finally, we also briefly discuss a method that does not require such an understanding.", + "title": "Learning Robust Models by Countering Spurious Correlations", + "authors": [ + "Haohan Wang", + "Zeyi Huang", + "Eric Xing" + ], + "emails": [ + "~Eric_Xing1", + "~Haohan_Wang1", + "~Zeyi_Huang3" + ], + "rank": 2247 + }, + { + "url": "https://openreview.net/forum?id=-yo2vfTt_Cg", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4, + 6 + ], + "rating": "4.54", + "confidences": [ + 4, + 3, + 4, + 2 + ], + "abstract": "We investigate the use of regularized Newton methods with adaptive norms for optimizing neural networks. This approach can be seen as a second-order counterpart of adaptive gradient methods, which we here show to be interpretable as first-order trust region methods with ellipsoidal constraints. In particular, we prove that the preconditioning matrix used in RMSProp and Adam satisfies the necessary conditions for provable convergence of second-order trust region methods with standard worst-case complexities on general non-convex objectives. Furthermore, we run experiments across different neural architectures and datasets to find that the ellipsoidal constraints constantly outperform their spherical counterpart both in terms of number of backpropagations and asymptotic loss value. 
Finally, we find comparable performance to state-of-the-art first-order methods in terms of backpropagations, but further advances in hardware are needed to render Newton methods competitive in terms of computational time.", + "title": "Adaptive norms for deep learning with regularized Newton methods", + "authors": [ + "Jonas K Kohler", + "Leonard Adolphs", + "Aurelien Lucchi" + ], + "emails": [ + "~Jonas_K_Kohler1", + "~Aurelien_Lucchi1", + "~Leonard_Adolphs1" + ], + "rank": 2248 + }, + { + "url": "https://openreview.net/forum?id=mUU6jPGuvjx", + "ratings": [ + 6, + 4, + 5, + 4 + ], + "rating": "4.54", + "confidences": [ + 2, + 4, + 3, + 4 + ], + "abstract": "Compositional, structured models are appealing because they explicitly decompose problems and provide interpretable intermediate outputs that give confidence that the model is not simply latching onto data artifacts. Learning these models is challenging, however, because end-task supervision only provides a weak indirect signal on what values the latent decisions should take. This often results in the model failing to learn to perform the intermediate tasks correctly. In this work, we introduce a way to leverage paired examples that provide stronger cues for learning latent decisions. When two related training examples share internal substructure, we add an additional training objective to encourage consistency between their latent decisions. Such an objective does not require external supervision for the values of the latent output, or even the end task, yet provides an additional training signal to that provided by individual training examples themselves. We apply our method to improve compositional question answering using neural module networks on the DROP dataset. 
We explore three ways to acquire paired questions in DROP: (a) discovering naturally occurring paired examples within the dataset, (b) constructing paired examples using templates, and (c) generating paired examples using a question generation model. We empirically demonstrate that our proposed approach improves both in- and out-of-distribution generalization and leads to correct latent decision predictions.", + "title": "Paired Examples as Indirect Supervision in Latent Decision Models", + "authors": [ + "Nitish Gupta", + "Sameer Singh", + "Matt Gardner", + "Dan Roth" + ], + "emails": [ + "~Sameer_Singh1", + "~Nitish_Gupta1", + "~Matt_Gardner1", + "~Dan_Roth3" + ], + "rank": 2249 + }, + { + "url": "https://openreview.net/forum?id=PDLPdWHdp-h", + "ratings": [ + 7, + 3, + 5, + 4 + ], + "rating": "4.54", + "confidences": [ + 3, + 4, + 2, + 4 + ], + "abstract": "Adversarial vulnerability is a fundamental limitation of deep neural networks which remains poorly understood. Recent work suggests that adversarial attacks on deep neural network classifiers exploit the fact that non-robust models rely on superficial statistics to form predictions. While such features are semantically meaningless, they are strongly predictive of the input\u2019s label, allowing non-robust networks to achieve good generalization on unperturbed test inputs. However, this hypothesis fails to explain why autoencoders are also vulnerable to adversarial attacks, despite achieving low reconstruction error on clean inputs. We show that training an autoencoder on adversarial input-target pairs leads to low reconstruction error on the standard test set, suggesting that adversarial attacks on autoencoders are predictive. In this work, we study the predictive power of adversarial examples on autoencoders through the lens of compressive sensing. 
We characterize the relationship between adversarial perturbations and target inputs and reveal that training autoencoders on adversarial input-target pairs is a form of knowledge distillation, achieved by learning to attenuate structured noise.", + "title": "Understanding Adversarial Attacks on Autoencoders", + "authors": [ + "Elsa Riachi", + "Frank Rudzicz" + ], + "emails": [ + "~Frank_Rudzicz2", + "~Elsa_Riachi1" + ], + "rank": 2250 + }, + { + "url": "https://openreview.net/forum?id=uB5x7Y2qsFR", + "ratings": [ + 5, + 5, + 4, + 4 + ], + "rating": "4.53", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Advances in semi-supervised methods for image classification significantly boosted performance in the learning with noisy labels (LNL) task. Specifically, by discarding the erroneous labels (and keeping the samples), the LNL task becomes a semi-supervised one for which powerful tools exist. Identifying the noisy samples, however, heavily relies on the success of a warm-up stage where standard supervised training is performed using the full (noisy) training set. This stage is sensitive not only to the noise level but also to the choice of hyperparameters. \nIn this paper, we propose to solve this problem by utilizing self-supervised pre-training. Our approach, which we name Contrast to Divide, offers several important advantages. First, by removing the labels altogether, our pre-trained features become agnostic to the labels' amount of noise, allowing accurate noisy separation even under high noise levels. Second, as recently shown, semi-supervised methods significantly benefit from self-supervised pre-training. Moreover, compared with standard pre-training approaches (e.g., supervised training on ImageNet), self-supervised pre-training does not suffer from a domain gap.\nWe demonstrate the effectiveness of the proposed method in various settings with both synthetic and real noise. 
Our results indicate that Contrast to Divide brings a new state-of-the-art by a significant margin to both CIFAR-10 and CIFAR-100. For example, in the high-noise regime of 90%, we get a boost of more than 27% for CIFAR-100 and more than 17% for CIFAR-10 over the previous state-of-the-art. Moreover, we achieve comparable performance on Clothing-1M without using ImageNet pre-training. Code for reproducing our experiments is available at https://github.com/ContrastToDivide/C2D", + "title": "Contrast to Divide: self-supervised pre-training for learning with noisy labels", + "authors": [ + "Evgenii Zheltonozhskii", + "Chaim Baskin", + "Avi Mendelson", + "Alex M. Bronstein", + "Or Litany" + ], + "emails": [ + "~Avi_Mendelson1", + "~Chaim_Baskin1", + "~Alex_M._Bronstein1", + "~Evgenii_Zheltonozhskii1", + "~Or_Litany1" + ], + "rank": 2251 + }, + { + "url": "https://openreview.net/forum?id=P63SQE0fVa", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 5 + ], + "rating": "4.53", + "confidences": [ + 5, + 3, + 5, + 4 + ], + "abstract": "Combinatorial Optimization (CO) problems are theoretically challenging yet crucial in practice. Numerous works used Reinforcement Learning (RL) to tackle these CO problems. As current approaches mainly focus on single-worker CO problems such as the famous Travelling Salesman Problem (TSP), we focus on more practical extension of TSP to multi-worker (salesmen) setting, specifically MinMax mTSP. From the RL perspective, Minmax mTSP raises several significant challenges, such as the cooperation of multiple workers and the need for a well-engineered reward function. In this paper, we present the RL framework with (1) worker-task heterograph and type-aware Graph Neural Network, and (2) the RL training method that is stable, has fast convergence speed, and directly optimizes the objective of MinMax mTSP in a delayed reward setting. 
We achieve comparable performance to a highly optimized meta-heuristic baseline, OR-Tools, and outperforms it in 10% of the cases, both on in-training and out-of-training problem distributions. Moreover, our problem formulation enables us to solve problems with any number of salesmen (workers) and cities.", + "title": "ScheduleNet: Learn to Solve MinMax mTSP Using Reinforcement Learning with Delayed Reward", + "authors": [ + "Junyoung Park", + "Sanzhar Bakhtiyarov", + "Jinkyoo Park" + ], + "emails": [ + "~Sanzhar_Bakhtiyarov1", + "~Jinkyoo_Park1", + "~Junyoung_Park1" + ], + "rank": 2252 + }, + { + "url": "https://openreview.net/forum?id=zsKWh2pRSBK", + "decision": "Reject", + "ratings": [ + 7, + 5, + 5, + 2 + ], + "rating": "4.53", + "confidences": [ + 4, + 3, + 3, + 5 + ], + "abstract": "Under a commonly-studied \u201cbackdoor\u201d poisoning attack against classification models, an attacker adds a small \u201ctrigger\u201d to a subset of the training data, such that the presence of this trigger at test time causes the classifier to always predict some target class. It is often implicitly assumed that the poisoned classifier is vulnerable exclusively to the adversary who possesses the trigger. In this paper, we show empirically that this view of backdoored classifiers is fundamentally incorrect. We demonstrate that anyone with access to the classifier, even without access to any original training data or trigger, can construct several alternative triggers that are as effective or more so at eliciting the target class at test time. We construct these alternative triggers by first generating adversarial examples for a smoothed version of the classifier, created with a recent process called Denoised Smoothing, and then extracting colors or cropped portions of adversarial images. 
We demonstrate the effectiveness of our attack through extensive experiments on ImageNet and TrojAI datasets, including a user study which demonstrates that our method allows users to easily determine the existence of such backdoors in existing poisoned classifiers. Furthermore, we demonstrate that our alternative triggers can in fact look entirely different from the original trigger, highlighting that the backdoor actually learned by the classifier differs substantially from the trigger image itself. Thus, we argue that there is no such thing as a \u201csecret\u201d backdoor in poisoned classifiers: poisoning a classifier invites attacks not just by the party that possesses the trigger, but from anyone with access to the classifier.", + "title": "Poisoned classifiers are not only backdoored, they are fundamentally broken", + "authors": [ + "Mingjie Sun", + "Siddhant Agarwal", + "J Zico Kolter" + ], + "emails": [ + "~Mingjie_Sun1", + "~J_Zico_Kolter1", + "~Siddhant_Agarwal1" + ], + "rank": 2253 + }, + { + "url": "https://openreview.net/forum?id=bhKQ7P7gyLA", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 5 + ], + "rating": "4.53", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "We apply concepts from manifold regularization to develop new regularization techniques for training locally stable deep neural networks. Our regularizers encourage functions which are smooth not only in their predictions but also their decision boundaries. Empirically, our networks exhibit stability in a diverse set of perturbation models, including \u21132, \u2113\u221e, and Wasserstein-based perturbations; in particular, against a state-of-the-art PGD adversary, a single model achieves both \u2113\u221e robustness of 40% at \u03f5=8/255 and \u21132 robustness of 48% at \u03f5=1.0 on CIFAR-10. We also obtain state-of-the-art verified accuracy of 21% in the same \u2113\u221e setting. 
Furthermore, our techniques are efficient, incurring overhead on par with two additional parallel forward passes through the network; in the case of CIFAR-10, we achieve our results after training for only 3 hours, compared to more than 70 hours for standard adversarial training.", + "title": "Manifold Regularization for Locally Stable Deep Neural Networks", + "authors": [ + "Charles Jin", + "Martin Rinard" + ], + "emails": [ + "~Charles_Jin1", + "~Martin_Rinard1" + ], + "rank": 2254 + }, + { + "url": "https://openreview.net/forum?id=fdZvTFn8Yq", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 4 + ], + "rating": "4.53", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Transfer and meta-learning algorithms leverage evaluations on related tasks in order to significantly speed up learning or optimization on a new problem. For applications that depend on uncertainty estimates, e.g., in Bayesian optimization, recent probabilistic approaches have shown good performance at test time, but either scale poorly with the number of data points or under-perform with little data on the test task. In this paper, we propose a novel approach to probabilistic transfer learning that uses a generative model for the underlying data distribution and simultaneously learns a latent feature distribution to represent unknown task properties. To enable fast and accurate inference at test-time, we introduce a novel meta-loss that structures the latent space to match the prior used for inference. Together, these contributions ensure that our probabilistic model exhibits high sample-efficiency and provides well-calibrated uncertainty estimates. We evaluate the proposed approach and compare its performance to probabilistic models from the literature on a set of Bayesian optimization transfer-learning tasks. 
", + "title": "Probabilistic Meta-Learning for Bayesian Optimization", + "authors": [ + "Felix Berkenkamp", + "Anna Eivazi", + "Lukas Grossberger", + "Kathrin Skubch", + "Jonathan Spitz", + "Christian Daniel", + "Stefan Falkner" + ], + "emails": [ + "~Christian_Daniel1", + "de.bosch.com", + "~Stefan_Falkner1", + "~Felix_Berkenkamp1", + "gmail.com", + "~Lukas_Grossberger1" + ], + "rank": 2255 + }, + { + "url": "https://openreview.net/forum?id=4Un_FnHiN8C", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4, + 5 + ], + "rating": "4.53", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "In this paper, we explore an alternate method for synthesizing neural network architectures, inspired by the brain's stochastic synaptic pruning. During a person\u2019s lifetime, numerous distinct neuronal architectures are responsible for performing the same tasks. This indicates that biological neural networks are, to some degree, architecture agnostic. However, artificial networks rely on their fine-tuned weights and hand-crafted architectures for their remarkable performance. This contrast begs the question: Can we build artificial architecture agnostic neural networks? To ground this study we utilize sparse, binary neural networks that parallel the brain\u2019s circuits. Within this sparse, binary paradigm we sample many binary architectures to create families of architecture agnostic neural networks not trained via backpropagation. These high-performing network families share the same sparsity, distribution of binary weights, and succeed in both static and dynamic tasks. 
In summation, we create an architecture manifold search procedure to discover families of architecture agnostic neural networks.", + "title": "Architecture Agnostic Neural Networks", + "authors": [ + "Sabera J Talukder", + "Guruprasad Raghavan", + "Yisong Yue" + ], + "emails": [ + "~Sabera_J_Talukder1", + "~Yisong_Yue1", + "~Guruprasad_Raghavan1" + ], + "rank": 2256 + }, + { + "url": "https://openreview.net/forum?id=XEw5Onu69uu", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5, + 4 + ], + "rating": "4.53", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "To be able to predict a molecular graph structure (W) given a 2D image of a chemical compound (U) is a challenging problem in machine learning. We are interested to learn f:U\u2192W where we have a fully mediating representation V such that f factors into U\u2192V\u2192W. However, observing V requires detailed and expensive labels. We propose \\textbf{graph aligning} approach that generates rich or detailed labels given normal labels W. In this paper we investigate the scenario of domain adaptation from the source domain where we have access to the expensive labels V to the target domain where only normal labels W are available. Focusing on the problem of predicting chemical compound graphs from 2D images the fully mediating layer is represented using the planar embedding of the chemical graph structure we are predicting. The use of a fully mediating layer implies some assumptions on the mechanism of the underlying process. However if the assumptions are correct it should allow the machine learning model to be more interpretable, generalize better and be more data efficient at training time.\nThe empirical results show that, using only 4000 data points, we obtain up to 4x improvement of performance after domain adaptation to target domain compared to pretrained model only on the source domain. 
After domain adaptation, the model is even able to detect atom types that were never seen in the original source domain. Finally, on the Maybridge data set the proposed self-labeling approach reached higher performance than the current state of the art.", + "title": "Self-Labeling of Fully Mediating Representations by Graph Alignment", + "authors": [ + "Martijn Oldenhof", + "Adam Arany", + "Yves Moreau", + "Jaak Simm" + ], + "emails": [ + "~Adam_Arany1", + "~Jaak_Simm1", + "~Martijn_Oldenhof1", + "~Yves_Moreau2" + ], + "rank": 2257 + }, + { + "url": "https://openreview.net/forum?id=kKwFlM32HV5", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5, + 4 + ], + "rating": "4.53", + "confidences": [ + 5, + 5, + 4, + 3 + ], + "abstract": "We introduce a novel and adaptive batch-wise regularization based on the proposed Batch Confusion Norm (BCN) to flexibly address the natural world distribution which usually involves fine-grained and long-tailed properties at the same time. The Fine-Grained Visual Classification (FGVC) problem is notably characterized by two intriguing properties, significant inter-class similarity and intra-class variations, which cause learning an effective FGVC classifier a challenging task. Existing techniques attempt to capture the discriminative parts by their modified attention mechanism. The long-tailed distribution of visual classification poses a great challenge for handling the class imbalance problem. Most of existing solutions usually focus on the class-balancing strategies, classifier normalization, or alleviating the negative gradient of tailed categories. Depart from the conventional approaches, we propose to tackle both problems simultaneously with the adaptive confusion concept. When inter-class similarity prevails in a batch, the BCN term can alleviate possible overfitting due to exploring image features of fine details. 
On the other hand, when inter-class similarity is not an issue, the class predictions from different samples would unavoidably yield a substantial BCN loss, and prompt the network learning to further reduce the cross-entropy loss. More importantly, extending the existing confusion energy-based framework to account for long-tailed scenario, BCN can learn to exert proper distribution of confusion strength over tailed and head categories to improve classification performance. While the resulting FGVC model by the BCN technique is effective, the performance can be consistently boosted by incorporating extra attention mechanism. In our experiments, we have obtained state-of-the-art results on several benchmark FGVC datasets, and also demonstrated that our approach is competitive on the popular natural world distribution dataset, iNaturalist2018. ", + "title": "Natural World Distribution via Adaptive Confusion Energy Regularization", + "authors": [ + "Yen-Chi Hsu", + "Cheng-Yao Hong", + "Wan-Cyuan Fan", + "Ding-Jie Chen", + "Ming-Sui Lee", + "davi geiger", + "Tyng-Luh Liu" + ], + "emails": [ + "~Ding-Jie_Chen1", + "~Yen-Chi_Hsu1", + "~Cheng-Yao_Hong2", + "~Ming-Sui_Lee1", + "~Wan-Cyuan_Fan1", + "~davi_geiger1", + "~Tyng-Luh_Liu1" + ], + "rank": 2258 + }, + { + "url": "https://openreview.net/forum?id=BMua55nUyyt", + "decision": "Reject", + "ratings": [ + 4, + 7, + 4, + 4 + ], + "rating": "4.53", + "confidences": [ + 5, + 3, + 4, + 5 + ], + "abstract": "Privacy-preserving data analysis becomes prevailing in recent years. It is a common sense in privacy literature that strict differential privacy can only be obtained by imposing additional randomness in the algorithm. In this paper, we study the problem of private sign recovery for sparse mean estimation and sparse linear regression in a distributed setup. 
By taking a coordinate-wise median among the reported local sign vectors, which can be referred to as a median divide-and-conquer (Med-DC) approach, we can recover the signs of the true parameter with a provable consistency guarantee. Moreover, without adding any extra randomness to the algorithm, our Med-DC method can protect data privacy with high probability. Simulation studies are conducted to demonstrate the effectiveness of our proposed method.", + "title": "Median DC for Sign Recovery: Privacy can be Achieved by Deterministic Algorithms", + "authors": [ + "Jiyuan Tu", + "Weidong Liu", + "Xiaojun Mao" + ], + "emails": [ + "~Xiaojun_Mao1", + "~Jiyuan_Tu1", + "~Weidong_Liu2" + ], + "rank": 2259 + }, + { + "url": "https://openreview.net/forum?id=MBIy8WLgsw", + "ratings": [ + 5, + 4, + 6, + 4 + ], + "rating": "4.53", + "confidences": [ + 4, + 4, + 2, + 5 + ], + "abstract": "An essential step in the task of model selection, such as hyper-parameter optimization (HPO) or neural architecture search (NAS), is the process of estimating a candidate model's (hyper-parameter or architecture) performance. Due to the high computational cost of training models until full convergence, it is necessary to develop efficient methods that can accurately estimate a model's best performance using only a small time budget. To this end, we propose a novel performance estimation method which uses a history of model features observed during the early stages of training to obtain an estimate of final performance. Our method is versatile. It can be combined with different search algorithms and applied to various configuration spaces in HPO and NAS. Using a sampling-based search algorithm and parallel computing, our method can find an architecture which is better than DARTS and with an 80\\% reduction in search time.", + "title": "Efficient Model Performance Estimation via Feature Histories", + "authors": [ + "Shengcao Cao", + "Xiaofang Wang", + "Kris M. 
Kitani" + ], + "emails": [ + "~Kris_M._Kitani1", + "~Xiaofang_Wang1", + "~Shengcao_Cao1" + ], + "rank": 2260 + }, + { + "url": "https://openreview.net/forum?id=3Jldbtfqfa", + "ratings": [ + 5, + 4, + 5, + 4 + ], + "rating": "4.53", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Knowledge distillation is a critical technique to transfer knowledge between models, typically from a large model (the teacher) to a smaller one (the student). The objective function of knowledge distillation is typically the cross-entropy between the teacher and the student's output distributions. However, for structured prediction problems, the output space is exponential in size; therefore, the cross-entropy objective becomes intractable to compute and optimize directly. In this paper, we derive a factorized form of the knowledge distillation objective for structured prediction, which is tractable for many typical choices of the teacher and student models. In particular, we show the tractability and empirical effectiveness of structural knowledge distillation between sequence labeling and dependency parsing models under four different scenarios: 1) the teacher and student share the same factorization form of the output structure scoring function; 2) the student factorization produces smaller substructures than the teacher factorization; 3) the teacher factorization produces smaller substructures than the student factorization; 4) the factorization forms from the teacher and the student are incompatible.", + "title": "Structural Knowledge Distillation", + "authors": [ + "Xinyu Wang", + "Yong Jiang", + "Zhaohui Yan", + "Zixia Jia", + "Nguyen Bach", + "Tao Wang", + "Zhongqiang Huang", + "Fei Huang", + "Kewei Tu" + ], + "emails": [ + "~Zhongqiang_Huang1", + "~Kewei_Tu1", + "~Zixia_Jia1", + "~Tao_Wang4", + "~Yong_Jiang1", + "~Nguyen_Bach1", + "~Zhaohui_Yan1", + "~Fei_Huang2", + "~Xinyu_Wang3" + ], + "rank": 2261 + }, + { + "url": "https://openreview.net/forum?id=VG3i3CfFN__", + 
"decision": "Reject", + "ratings": [ + 5, + 3, + 7, + 3 + ], + "rating": "4.53", + "confidences": [ + 5, + 5, + 4, + 3 + ], + "abstract": "Semantic parsing is a challenging task whose purpose is to convert a natural language utterance to machine-understandable information representation. Recently, solutions using Neural Machine Translation have achieved many promising results, especially Transformer because of the ability to learn long-range word dependencies. However, the one drawback of adapting the original Transformer to the semantic parsing is the lack of detail in expressing the information of sentences. Therefore, this work proposes a PhraseTransformer architecture that is capable of a more detailed meaning representation by learning the phrase dependencies in the sentence. The main idea is to incorporate Long Short-Term Memory (LSTM) into the Self-Attention mechanism of the original Transformer to capture more local context of phrases. Experimental results show that the proposed model captures the detailed meaning better than Transformer, raises local context awareness and achieves strong competitive performance on Geo, MSParS datasets, and leads to SOTA performance on Atis dataset in methods using Neural Network.", + "title": "PhraseTransformer: Self-Attention using Local Context for Semantic Parsing", + "authors": [ + "Phuong Minh Nguyen", + "Vu Tran", + "Minh Le Nguyen" + ], + "emails": [ + "~Phuong_Minh_Nguyen3", + "jaist.ac.jp" + ], + "rank": 2262 + }, + { + "url": "https://openreview.net/forum?id=mxIEptSTK6Z", + "decision": "Reject", + "ratings": [ + 6, + 4, + 4, + 4 + ], + "rating": "4.53", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Continual learning is a concept of online learning along with multiple sequential tasks. One of the critical barriers of continual learning is that a network should learn a new task keeping the knowledge of old tasks without access to any data of the old tasks. 
In this paper, we propose a neuron importance based regularization method for stable continual learning.\nWe propose a comprehensive experimental evaluation framework on existing benchmark data sets to evaluate not just the accuracy of a certain order of continual learning performance also the robustness of the accuracy along with the changes in the order of tasks.", + "title": "Continual learning with neural activation importance", + "authors": [ + "Sohee Kim", + "Seungkyu Lee" + ], + "emails": [ + "~Seungkyu_Lee1", + "khu.ac.kr" + ], + "rank": 2263 + }, + { + "url": "https://openreview.net/forum?id=0_YzHnuthDf", + "ratings": [ + 5, + 5, + 4, + 4 + ], + "rating": "4.53", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "We consider the domain generalization problem, where the test domain differs from the training domain. For deep neural networks, we show that the batch normalization layer is a highly unstable component under such domain shifts, and we identify two sources for its instability. Based on this observation, we propose a new learning formulation that can learn robust neural networks so that the corresponding batch normalization layers are invariant under domain shifts. 
Experimental results on three standard domain generalization benchmarks demonstrate that our method can learn neural network models with significantly more stable batch normalization layers on unseen domains, and the improved stability leads to superior generalization performances.", + "title": "Invariant Batch Normalization for Multi-source Domain Generalization", + "authors": [ + "Qing LIAN", + "LIN Yong", + "Tong Zhang" + ], + "emails": [ + "~Qing_LIAN3", + "~Tong_Zhang2", + "~LIN_Yong1" + ], + "rank": 2264 + }, + { + "url": "https://openreview.net/forum?id=_ojjh-QFiFr", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5, + 4 + ], + "rating": "4.53", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "We present Language-mediated, Object-centric Representation Learning (LORL), learning disentangled, object-centric scene representations from vision and language. LORL builds upon recent advances in unsupervised object segmentation, notably MONet and Slot Attention. Just like these algorithms, LORL also learns an object-centric representation by reconstructing the input image. But LORL further learns to associate the learned representations to concepts, i.e., words for object categories, properties, and spatial relationships, from language input. These object-centric concepts derived from language facilitate the learning of object-centric representations. LORL can be integrated with various unsupervised segmentation algorithms that are language-agnostic. Experiments show that LORL consistently improves the performance of MONet and Slot Attention on two datasets via the help of language. 
We also show that concepts learned by LORL aid downstream tasks such as referential expression interpretation.", + "title": "Language-Mediated, Object-Centric Representation Learning", + "authors": [ + "Ruocheng Wang", + "Jiayuan Mao", + "Samuel Gershman", + "Jiajun Wu" + ], + "emails": [ + "~Jiajun_Wu1", + "~Jiayuan_Mao1", + "~Ruocheng_Wang2", + "~Samuel_Gershman1" + ], + "rank": 2265 + }, + { + "url": "https://openreview.net/forum?id=sojnduJtbfQ", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 5 + ], + "rating": "4.53", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Do all adversarial examples have the same consequences? An autonomous driving system misclassifying a pedestrian as a car may induce a far more dangerous --and even potentially lethal-- behavior than, for instance, a car as a bus. In order to better tackle this important problematic, we introduce the concept of hierarchical adversarial robustness. Given a dataset whose classes can be grouped into coarse-level labels, we define hierarchical adversarial examples as the ones leading to a misclassification at the coarse level. To improve the resistance of neural networks to hierarchical attacks, we introduce a hierarchical adversarially robust (HAR) network design that decomposes a single classification task into one coarse and multiple fine classification tasks, before being specifically trained by adversarial defense techniques. 
As an alternative to an end-to-end learning approach, we show that HAR significantly improves the robustness of the network against \u2113\u221e and \u21132bounded hierarchical attacks on CIFAR-100.", + "title": "Improving Hierarchical Adversarial Robustness of Deep Neural Networks", + "authors": [ + "Avery Ma", + "Aladin Virmaux", + "Kevin Scaman", + "Juwei Lu" + ], + "emails": [ + "~Avery_Ma1", + "~Kevin_Scaman1", + "~Aladin_Virmaux1", + "~Juwei_Lu2" + ], + "rank": 2266 + }, + { + "url": "https://openreview.net/forum?id=5qK0RActG1x", + "decision": "Reject", + "ratings": [ + 6, + 4, + 5, + 3 + ], + "rating": "4.53", + "confidences": [ + 3, + 4, + 5, + 3 + ], + "abstract": "Deep learning interpretability tools, such as (Bau et al., 2017; Ribeiro et al., 2016; Smilkov et al., 2017), have been proposed to explain and visualize the ways that deep neural networks make predictions. The success of these methods highly relies on human subjective interpretations, i.e., the ground truth of interpretations, such as feature importance ranking or locations of visual objects, when evaluating the interpretability of the deep models on a speci\ufb01c task. For tasks that the ground truth of interpretations is not available, we propose a novel framework Consensus incorporating an ensemble of deep models as the committee for interpretability evaluation. Given any task/dataset, Consensus \ufb01rst obtains the interpretation results using existing tools, e.g., LIME (Ribeiro et al., 2016), for every model in the committee, then aggregates the results from the entire committee and approximates the \u201cground truth\u201d of interpretations through voting. With such approximated ground truth, Consensus evaluates the interpretability of a model through matching its interpretation result and the approximated one, and ranks the matching scores together with committee members, so as to pursue the absolute and relative interpretability evaluation results. 
We carry out extensive experiments to validate Consensus on various datasets. The results show that Consensus can precisely identify the interpretability for a wide range of models on ubiquitous datasets that the ground truth is not available. Robustness analyses further demonstrate the advantage of the proposed framework to reach the consensus of interpretations through simple voting and evaluate the interpretability of deep models. Through the proposed Consensus framework, the interpretability evaluation has been democratized without the need of ground truth as criterion.", + "title": "Democratizing Evaluation of Deep Model Interpretability through Consensus", + "authors": [ + "Xuhong Li", + "Haoyi Xiong", + "Siyu Huang", + "Shilei Ji", + "Yanjie Fu", + "Dejing Dou" + ], + "emails": [ + "baidu.com", + "~Haoyi_Xiong1", + "~Yanjie_Fu2", + "~Dejing_Dou1", + "~Xuhong_Li3", + "~Siyu_Huang2" + ], + "rank": 2267 + }, + { + "url": "https://openreview.net/forum?id=Au1gNqq4brw", + "decision": "Reject", + "ratings": [ + 4, + 3, + 5, + 6, + 4 + ], + "rating": "4.53", + "confidences": [ + 4, + 3, + 4, + 4, + 2 + ], + "abstract": "Modern recurrent neural networks (RNN) such as Gated Recurrent Units (GRU) and Long Short-term Memory (LSTM) have demonstrated impressive results on tasks involving sequential data in practice. Despite continuous efforts on interpreting their behaviors, the exact mechanism underlying their successes in capturing sequence-level information have not been thoroughly understood. In this work, we present a study on understanding the essential features captured by GRU/LSTM cells by mathematically expanding and unrolling the hidden states. Based on the expanded and unrolled hidden states, we find there was a type of sequence-level representations brought in by the gating mechanism, which enables the cells to encode sequence-level features along with token-level features. 
Specifically, we show that the cells would consist of such sequence-level features similar to those of N-grams. Based on such a finding, we also found that replacing the hidden states of the standard cells with N-gram representations does not necessarily degrade performance on the sentiment analysis and language modeling tasks, indicating such features may play a significant role for GRU/LSTM cells.", + "title": "SEQUENCE-LEVEL FEATURES: HOW GRU AND LSTM CELLS CAPTURE N-GRAMS", + "authors": [ + "Xiaobing Sun", + "Wei Lu" + ], + "emails": [ + "~Xiaobing_Sun1", + "sutd.edu.sg" + ], + "rank": 2268 + }, + { + "url": "https://openreview.net/forum?id=aIg2i1IKv0w", + "ratings": [ + 4, + 4, + 5, + 5 + ], + "rating": "4.50", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "A fundamental challenge in artificial intelligence is learning useful representations of data that yield good performance on a downstream classification task, without overfitting to spurious input features. Extracting task-relevant predictive information becomes particularly challenging for high-dimensional, noisy, real-world data. We propose Contrastive Input Morphing (CIM), a representation learning framework that learns input-space transformations of the data to mitigate the effect of irrelevant input features on downstream performance via a triplet loss. 
Empirically, we demonstrate the efficacy of our approach on various tasks which typically suffer from the presence of spurious correlations, and show that CIM improves the performance of other representation learning methods such as variational information bottleneck (VIB) when used in conjunction.", + "title": "Learning Task-Relevant Features via Contrastive Input Morphing", + "authors": [ + "Saeid Asgari", + "Kristy Choi", + "Amir Hosein Khasahmadi", + "Anirudh Goyal" + ], + "emails": [ + "~Anirudh_Goyal1", + "~Kristy_Choi1", + "~Saeid_Asgari1", + "~Amir_Hosein_Khasahmadi1" + ], + "rank": 2269 + }, + { + "url": "https://openreview.net/forum?id=6Lhv4x2_9pw", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 6 + ], + "rating": "4.50", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "I present a simple but informative approach to gain insight into the Bayesian neural network (BNN) trained parameters. I used 2000 dynamic rupture simulations to train a BNN model to predict if an earthquake can break through a simple 2D fault. In each simulation, fault geometry, stress conditions, and friction parameters vary. The trained BNN parameters show that the network learns the physics of earthquake rupture. Neurons with high positive weights contribute to the earthquake rupture and vice versa. The results show that the stress condition of the fault plays a critical role in determining its strength. The stress is also the top source of uncertainty, followed by the dynamic friction coefficient. When stress and friction drop of a fault have higher value and are combined with higher weighted neurons, the prediction score increases, thus fault likely to be ruptured. Fault's width and height have the least amount of uncertainty, which may not be correct in a real scenario. 
The study shows that the potentiality of BNN that provides data patterns about rupture physics to make an additional information source for scientists studying the earthquake rupture.", + "title": "Bayesian neural network parameters provide insights into the earthquake rupture physics.", + "authors": [ + "Sabber Ahamed" + ], + "emails": [ + "~Sabber_Ahamed1" + ], + "rank": 2270 + }, + { + "url": "https://openreview.net/forum?id=c1zLYtHYyQG", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4, + 5 + ], + "rating": "4.50", + "confidences": [ + 5, + 4, + 2, + 3 + ], + "abstract": "Traditional reinforcement learning methods usually deal with the tasks with explicit reward signals. However, for vast majority of cases, the environment wouldn't feedback a reward signal immediately. It turns out to be a bottleneck for modern reinforcement learning approaches to be applied into more realistic scenarios. Recently, inverse reinforcement learning has made great progress in making full use of the expert demonstrations to recover the reward signal for reinforcement learning. And generative adversarial imitation learning is one promising approach. In this paper, we propose a new architecture for training generative adversarial imitation learning which is so called energy based generative adversarial imitation learning (EB-GAIL). It views the discriminator as an energy function that attributes low energies to the regions near the expert demonstrations and high energies to other regions. Therefore, a generator can be seen as a reinforcement learning procedure to sample trajectories with minimal energies (cost), while the discriminator is trained to assign high energies to these generated trajectories. In detail, EB-GAIL uses an auto-encoder architecture in place of the discriminator, with the energy being the reconstruction error. Theoretical analysis shows our EB-GAIL could match the occupancy measure with expert policy during the training process. 
Meanwhile, the experiments depict that EB-GAIL outperforms other SoTA methods while the training process for EB-GAIL can be more stable.", + "title": "Learning from Demonstrations with Energy based Generative Adversarial Imitation Learning", + "authors": [ + "Kaifeng Zhang" + ], + "emails": [ + "~Kaifeng_Zhang1" + ], + "rank": 2271 + }, + { + "url": "https://openreview.net/forum?id=PoP96DrBHnl", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 3 + ], + "rating": "4.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Off-policy algorithms, in which a behavior policy differs from the target policy and is used to gain experience for learning, have proven to be of great practical value in reinforcement learning. However, even for simple convex problems such as linear value function approximation, these algorithms are not guaranteed to be stable. To address this, alternative algorithms that are provably convergent in such cases have been introduced, the most well known being gradient descent temporal difference (GTD) learning. This algorithm and others like it, however, tend to converge much more slowly than conventional temporal difference learning.\nIn this paper we propose gradient descent temporal difference-difference (Gradient-DD) learning in order to accelerate GTD learning by introducing second-order differences in successive parameter updates.\nWe investigate this algorithm in the framework of linear value function approximation and analytically showing its improvement over GTD learning. 
Studying the model empirically on the random walk and Boyan-chain prediction tasks, we find substantial improvement over GTD learning and, in several cases, better performance even than conventional TD learning.\n", + "title": "Gradient descent temporal difference-difference learning", + "authors": [ + "Rong Zhu", + "James Murray" + ], + "emails": [ + "~Rong_Zhu4", + "uoregon.edu" + ], + "rank": 2272 + }, + { + "url": "https://openreview.net/forum?id=3teh9zI0j4L", + "decision": "Reject", + "ratings": [ + 3, + 6, + 6, + 3 + ], + "rating": "4.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "The exposure bias problem refers to the incrementally distorted generation induced by the training-generation discrepancy, in teacher-forcing training for auto-regressive neural network language models (LM). It has been regarded as a central problem for LMs trained for open-ended language generation. Although a lot of algorithms have been proposed to avoid teacher forcing and therefore alleviate exposure bias, there is little work showing how serious the exposure bias problem actually is. In this work, we propose novel metrics to quantify the impact of exposure bias in the generation of MLE-trained LMs. Our key intuition is that if we feed ground-truth data prefixes (instead of prefixes generated by the model itself) into the model and ask it to continue the generation, the performance should become much better because the training-generation discrepancy in the prefix is removed. We conduct both automatic and human evaluation in our experiments, and our observations are two-fold: (1) We confirm that the prefix discrepancy indeed induces some level of performance loss. 
(2) However, the induced distortion seems to be limited, and is not incremental during the generation, which contradicts the claim of exposure bias.", + "title": "Quantifying Exposure Bias for Open-ended Language Generation", + "authors": [ + "Tianxing He", + "Jingzhao Zhang", + "Zhiming Zhou", + "James R. Glass" + ], + "emails": [ + "~Tianxing_He1", + "~James_R._Glass1", + "~Jingzhao_Zhang2", + "~Zhiming_Zhou2" + ], + "rank": 2273 + }, + { + "url": "https://openreview.net/forum?id=0WWj8muw_rj", + "decision": "Reject", + "ratings": [ + 3, + 7, + 4, + 4 + ], + "rating": "4.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Adaptive gradient methods have been shown to outperform SGD in many tasks of training neural networks. However, the acceleration effect is yet to be explained in the non-convex setting since the best convergence rate of adaptive gradient methods is worse than that of SGD in literature. In this paper, we prove that adaptive gradient methods exhibit an O~(T\u22121/2)-convergence rate for finding first-order stationary points under the strong growth condition, which improves previous best convergence results of adaptive gradient methods and random shuffling SGD by factors of O(T\u22121/4) and O(T\u22121/6), respectively. In particular, we study two variants of AdaGrad with random shuffling for finite sum minimization. 
Our analysis suggests that the combination of random shuffling and adaptive learning rates gives rise to better convergence.", + "title": "Adaptive Gradient Methods Can Be Provably Faster than SGD with Random Shuffling", + "authors": [ + "Xunpeng Huang", + "Vicky Jiaqi Zhang", + "Hao Zhou", + "Lei Li" + ], + "emails": [ + "bytedance.com", + "~Xunpeng_Huang1", + "~Lei_Li11", + "~Vicky_Jiaqi_Zhang2" + ], + "rank": 2274 + }, + { + "url": "https://openreview.net/forum?id=zCu1BZYCueE", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 5 + ], + "rating": "4.50", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "Hyper-parameter optimization (HPO) is critical in training high performing Deep Neural Networks (DNN). Current methodologies fail to define an analytical response surface and remain a training bottleneck due to their use of additional internal hyper-parameters and lengthy evaluation cycles. We demonstrate that the low-rank factorization of the convolution weights of intermediate layers of a CNN can define an analytical response surface. We quantify how this surface acts as an auxiliary to optimizing training metrics. We introduce a dynamic tracking algorithm -- autoHyper -- that performs HPO on the order of hours for various datasets including ImageNet and requires no manual tuning. Our method -- using a single RTX2080Ti -- is able to select a learning rate within 59 hours for AdaM on ResNet34 applied to ImageNet and improves in testing accuracy by 4.93% over the default learning rate. In contrast to previous methods, we empirically prove that our algorithm and response surface generalize well across model, optimizer, and dataset selection removing the need for extensive domain knowledge to achieve high levels of performance.", + "title": "Response Modeling of Hyper-Parameters for Deep Convolutional Neural Networks", + "authors": [ + "Mathieu Tuli", + "Mahdi S. 
Hosseini", + "Konstantinos N Plataniotis" + ], + "emails": [ + "~Mahdi_S._Hosseini1", + "~Konstantinos_N_Plataniotis1", + "cs.toronto.edu" + ], + "rank": 2275 + }, + { + "url": "https://openreview.net/forum?id=GtCq61UFDId", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 4 + ], + "rating": "4.50", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "The ubiquity and rate of collection of cardiac signals produce large, unlabelled datasets. Active learning (AL) can exploit such datasets by incorporating human annotators (oracles) to improve generalization performance. However, the over-reliance of existing algorithms on oracles continues to burden physicians. To minimize this burden, we propose SoCal, a consistency-based AL framework that dynamically determines whether to request a label from an oracle or to generate a pseudo-label instead. We show that our framework decreases the labelling burden while maintaining strong performance, even in the presence of a noisy oracle.", + "title": "SoCal: Selective Oracle Questioning for Consistency-based Active Learning of Cardiac Signals", + "authors": [ + "Dani Kiyasseh", + "Tingting Zhu", + "David A. Clifton" + ], + "emails": [ + "~Dani_Kiyasseh1", + "~David_A._Clifton1", + "eng.ox.ac.uk" + ], + "rank": 2276 + }, + { + "url": "https://openreview.net/forum?id=dzZaIeG9-fW", + "decision": "Reject", + "ratings": [ + 3, + 5, + 5, + 5 + ], + "rating": "4.50", + "confidences": [ + 4, + 4, + 3, + 5 + ], + "abstract": "Source code is notably different from natural language in that it is meant to be executed. Experienced developers infer complex \"invariants\" about run-time state while reading code, which helps them to constrain and predict program behavior. Knowing these invariants can be helpful; yet developers rarely encode these explicitly, so machine-learning methods don't have much aligned data to learn from. 
We propose an approach that adapts cues within existing if-statements regarding explicit run-time expectations to generate aligned datasets of code and implicit invariants. We also propose a contrastive loss to inhibit generation of illogical invariants. Our model learns to infer a wide vocabulary of invariants for arbitrary code, which can be used to detect and repair real bugs. This is entirely complementary to established approaches, which either use logical engines that scale poorly, or run-time traces that are expensive to obtain; when present, that data can complement our tool, as we demonstrate in conjunction with Daikon, an existing tool. Our results show that neural models can derive useful representations of run-time behavior directly from source code.", + "title": "Learning to Infer Run-Time Invariants from Source code", + "authors": [ + "Vincent Josua Hellendoorn", + "Premkumar Devanbu", + "Alex Polozov", + "Mark Marron" + ], + "emails": [ + "~Premkumar_Devanbu1", + "microsoft.com", + "~Vincent_Josua_Hellendoorn1", + "~Alex_Polozov1" + ], + "rank": 2277 + }, + { + "url": "https://openreview.net/forum?id=NL40q_yuavv", + "ratings": [ + 6, + 4, + 3, + 5 + ], + "rating": "4.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "We propose a novel 3d shape representation for 3d shape reconstruction from a single image. Rather than predicting a shape directly, we train a network to generate a training set which will be feed into another learning algorithm to define the shape.\nTraining data generating networks establish a link between few-shot learning and 3d shape analysis. We propose a novel meta-learning framework to jointly train the data generating network and other components. 
We improve upon recent work on standard benchmarks for 3d shape reconstruction, but our novel shape representation has many applications.", + "title": "Training Data Generating Networks: Linking 3D Shapes and Few-Shot Classification", + "authors": [ + "Biao Zhang", + "Peter Wonka" + ], + "emails": [ + "~Peter_Wonka1", + "~Biao_Zhang5" + ], + "rank": 2278 + }, + { + "url": "https://openreview.net/forum?id=oKWmzgO7bfl", + "decision": "Reject", + "ratings": [ + 4, + 6, + 4 + ], + "rating": "4.50", + "confidences": [ + 4, + 3, + 5 + ], + "abstract": "Deep learning models owe their success at large, to the availability of a large amount of annotated data. They try to extract features from the data that contain useful information needed to improve their performance on target applications. Most works focus on directly optimizing the target loss functions to improve the accuracy by allowing the model to implicitly learn representations from the data. There has not been much work on using background/noise data to estimate the statistics of in-domain data to improve the feature representation of deep neural networks. In this paper, we probe this direction by deriving a relationship between the estimation of unknown parameters of the probability density function (pdf) of input data and classification accuracy. Using this relationship, we show that having a better estimate of the unknown parameters using background and in-domain data provides better features which leads to better accuracy. Based on this result, we introduce a simple but effective detection booster training (DBT) method that applies a detection loss function on the early layers of a neural network to discriminate in-domain data points from noise/background data, to improve the classifier accuracy. The background/noise data comes from the same family of pdfs of input data but with different parameter sets (e.g., mean, variance). 
In addition, we also show that our proposed DBT method improves the accuracy even with limited labeled in-domain training samples as compared to normal training. We conduct experiments on face recognition, image classification, and speaker classification problems and show that our method achieves superior performance over strong baselines across various datasets and model architectures.", + "title": "Detection Booster Training: A detection booster training method for improving the accuracy of classifiers.", + "authors": [ + "Ali Ghobadzadeh", + "Deepak Sridhar", + "Juwei Lu", + "Wei Li" + ], + "emails": [ + "~Deepak_Sridhar1", + "~Juwei_Lu2", + "~Ali_Ghobadzadeh1", + "~Wei_Li32" + ], + "rank": 2279 + }, + { + "url": "https://openreview.net/forum?id=Y0MgRifqikY", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5, + 4 + ], + "rating": "4.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Deep reinforcement learning (DRL) has great potential for acquiring the optimal action in complex environments such as games and robot control. However, it is difficult to analyze the decision-making of the agent, i.e., the reasons it selects the action acquired by learning. In this work, we propose Mask-Attention A3C (Mask A3C) that introduced an attention mechanism into Asynchronous Advantage Actor-Critic (A3C) which is an actor-critic-based DRL method, and can analyze decision making of agent in DRL. A3C consists of a feature extractor that extracts features from an image, a policy branch that outputs the policy, value branch that outputs the state value. In our method, we focus on the policy branch and value branch and introduce an attention mechanism to each. In the attention mechanism, mask processing is performed on the feature maps of each branch using mask-attention that expresses the judgment reason for the policy and state value with a heat map. 
We visualized mask-attention maps for games on the Atari 2600 and found we could easily analyze the reasons behind an agent\u2019s decision-making in various game tasks. Furthermore, experimental results showed that higher performance of the agent could be achieved by introducing the attention mechanism.\n", + "title": "Visual Explanation using Attention Mechanism in Actor-Critic-based Deep Reinforcement Learning", + "authors": [ + "Hidenori Itaya", + "Tsubasa Hirakawa", + "Takayoshi Yamashita", + "Hironobu Fujiyoshi", + "Komei Sugiura" + ], + "emails": [ + "~Tsubasa_Hirakawa1", + "~Hironobu_Fujiyoshi2", + "~Komei_Sugiura1", + "~Hidenori_Itaya1", + "~Takayoshi_Yamashita1" + ], + "rank": 2280 + }, + { + "url": "https://openreview.net/forum?id=0n3BaVlNsHI", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5, + 4 + ], + "rating": "4.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Convolutional Neural Networks (CNNs) are vulnerable to unseen noise on input images at the test time, and thus improving the robustness is crucial. In this paper, we propose DJMix, a data augmentation method to improve the robustness by mixing each training image and its discretized one. Discretization is done in an unsupervised manner by an autoencoder, and the mixed images are nearly impossible to distinguish from the original images. Therefore, DJMix can easily be adapted to various image recognition tasks. 
We verify the effectiveness of our method using classification, semantic segmentation, and detection using clean and noisy test images.", + "title": "DJMix: Unsupervised Task-agnostic Augmentation for Improving Robustness", + "authors": [ + "Ryuichiro Hataya", + "Hideki Nakayama" + ], + "emails": [ + "~Ryuichiro_Hataya1", + "~Hideki_Nakayama1" + ], + "rank": 2281 + }, + { + "url": "https://openreview.net/forum?id=EBRTjOm_sl1", + "decision": "Reject", + "ratings": [ + 4, + 3, + 7, + 4 + ], + "rating": "4.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Supervised learning models perform best when trained on a lot of data, but annotating training data is very costly in some domains. Active learning aims to chose only the most informative subset of unlabelled samples for annotation, thus saving annotation cost. Several heuristics for choosing this subset have been developed, which use fix policies for this choice. They are easily understandable and applied. However, there is no heuristic performing optimal in all settings. This lead to the development of agents learning the best selection policy from data. They formulate active learning as a Markov decision process and applying reinforcement learning (RL) methods to it. Their advantage is that they are able to use many features and to adapt to the specific task.\n\nOur paper proposes a new approach combining these advantages of learning active learning and heuristics: We propose to learn active learning using a parametrised ensemble of agents, where the parameters are learned using Monte Carlo policy search. 
As this approach can incorporate any active learning agent into its ensemble, it allows to increase the performance of every active learning agent by learning how to combine it with others.", + "title": "Learning Active Learning in the Batch-Mode Setup with Ensembles of Active Learning Agents", + "authors": [ + "Malte Ebner", + "Bernhard Kratzwald", + "Stefan Feuerriegel" + ], + "emails": [ + "ethz.ch", + "~Malte_Ebner1" + ], + "rank": 2282 + }, + { + "url": "https://openreview.net/forum?id=OtAnbr1OQAW", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4, + 5 + ], + "rating": "4.50", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "In this paper, we study the problem of autonomously discovering temporally abstracted actions, or options, for exploration in reinforcement learning. For learning diverse options suitable for exploration, we introduce the infomax termination objective defined as the mutual information between options and their corresponding state transitions. We derive a scalable optimization scheme for maximizing this objective via the termination condition of options, yielding the InfoMax Option Critic (IMOC) algorithm. Through illustrative experiments, we empirically show that IMOC learns diverse options and utilizes them for exploration. Moreover, we show that IMOC scales well to continuous control tasks.\n", + "title": "Diverse Exploration via InfoMax Options", + "authors": [ + "Yuji Kanagawa", + "Tomoyuki Kaneko" + ], + "emails": [ + "~Yuji_Kanagawa1", + "~Tomoyuki_Kaneko1" + ], + "rank": 2283 + }, + { + "url": "https://openreview.net/forum?id=drEe_dOHE_", + "ratings": [ + 3, + 4, + 5, + 6 + ], + "rating": "4.50", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Generative adversarial networks have shown their ability in capturing high-dimensional complex distributions and generating realistic data samples e.g. images. 
However, existing models still have difficulty in handling multi-modal outputs, which are often susceptible to mode collapse in the sense that the generator can only map latent variables to a part of modes of the target distribution. In this paper, we analyze the typical cases of mode collapse and define the concept of mode completeness in the view of probability measure. We further prove that the inverse mapping can play an effective role to mitigate mode collapse. Under this framework, we further adopt the multi-dimensional Gaussian loss instead of one-dimensional one that has been widely used in existing work, to generate diverse images. Our experiments on synthetic data as well as real-word images show the superiority of our model. Source code will be released with the final paper.", + "title": "InvertGAN: Reducing mode collapse with multi-dimensional Gaussian Inversion", + "authors": [ + "Liangliang Shi", + "Yang Li", + "Junchi Yan" + ], + "emails": [ + "~Yang_Li32", + "~Liangliang_Shi1", + "~Junchi_Yan2" + ], + "rank": 2284 + }, + { + "url": "https://openreview.net/forum?id=dKwmCtp6YI", + "decision": "Reject", + "ratings": [ + 3, + 4, + 5, + 6 + ], + "rating": "4.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Inspired by the phenomenon of performance disparity between languages in machine translation, we investigate whether and to what extent languages are equally hard to \"conditional-language-model\". Our goal is to improve our understanding and expectation of the relationship between language, data representation, size, and performance. We study one-to-one, bilingual conditional language modeling through a series of systematically controlled experiments with the Transformer and the 6 languages from the United Nations Parallel Corpus. 
We examine character, byte, and word models in 30 language directions and 5 data sizes, and observe indications suggesting a script bias on the character level, a length bias on the byte level, and a word bias that gives rise to a hierarchy in performance across languages. We also identify two types of sample-wise non-monotonicity --- while word-based representations are prone to exhibit Double Descent, length can induce unstable performance across the size range studied in a novel meta phenomenon which we term \"erraticity\". By eliminating statistically significant performance disparity on the character and byte levels by normalizing length and vocabulary in the data, we show that, in the context of computing with the Transformer, there is no complexity intrinsic to languages other than that related to their statistical attributes and that performance disparity is not a necessary condition but a byproduct of word segmentation. Our application of statistical comparisons as a fairness measure also serves as a novel rigorous method for the intrinsic evaluation of languages, resolving a decades-long debate on language complexity. While these quantitative biases leading to disparity are mitigable through a shallower network, we find room for a human bias to be reflected upon. 
We hope our work helps open up new directions in the area of language and computing that would be fairer and more flexible and foster a new transdisciplinary perspective for DL-inspired scientific progress.", + "title": "Representation and Bias in Multilingual NLP: Insights from Controlled Experiments on Conditional Language Modeling", + "authors": [ + "Ada Wan" + ], + "emails": [ + "~Ada_Wan1" + ], + "rank": 2285 + }, + { + "url": "https://openreview.net/forum?id=ba82GniSJdc", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 5 + ], + "rating": "4.50", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "As numerous meta-learning algorithms improve performance when solving few-shot classification problems for practical applications, accurate prediction of uncertainty, though challenging, has been considered essential. In this study, we contemplate modeling uncertainty in a few-shot classification framework and propose a straightforward method that appropriately predicts task uncertainty. We suppose that the random sampling of tasks can generate those in which it may be hard for the model to infer the queries from the support examples. Specifically, measuring the distributional mismatch between support and query sets via class-wise similarities, we propose novel meta-training that lets the model predict with careful confidence. Moreover, our method is algorithm-agnostic and readily expanded to include a range of meta-learning models. 
Through extensive experiments including dataset shift, we present that our training strategy helps the model avoid being indiscriminately confident, and thereby, produce calibrated classification results without the loss of accuracy.", + "title": "Task Calibration for Distributional Uncertainty in Few-Shot Classification", + "authors": [ + "Sungnyun Kim", + "Se-Young Yun" + ], + "emails": [ + "~Se-Young_Yun1", + "~Sungnyun_Kim1" + ], + "rank": 2286 + }, + { + "url": "https://openreview.net/forum?id=XqQQlvHvtI", + "ratings": [ + 5, + 5, + 4, + 4 + ], + "rating": "4.50", + "confidences": [ + 4, + 2, + 3, + 3 + ], + "abstract": "Exploration is a long-standing challenge in sequential decision problem in machine learning. This paper investigates the adoption of two theories of optimal stimulation level - \"the pacer principle\" and the Wundt curve - from psychology to improve the exploration challenges. We propose a method called exploration with pleasure (EP) which is formulated based on the notion of pleasure as defined in accordance with the above two theories. EP is able to identify the region of stimulations that will trigger pleasure to the learning agent during exploration and consequently improve on the learning process. The effectiveness of EP is studied in two machine learning settings: curiosity-driven reinforcement learning (RL) and Bayesian optimisation (BO). Experiments in purely curiosity-driven RL show that by using EP to generate intrinsic rewards, it can yield faster learning. 
Experiments in BO demonstrate that by using EP to specify the exploration parameters in two acquisition functions - Probability of Improvement and Expected Improvement - it can achieve faster convergence and better function values.\n", + "title": "Learning to Explore with Pleasure", + "authors": [ + "Yean Hoon Ong", + "Jun Wang" + ], + "emails": [ + "~Jun_Wang2", + "~Yean_Hoon_Ong1" + ], + "rank": 2287 + }, + { + "url": "https://openreview.net/forum?id=E9W0QPxtZ_u", + "decision": "Reject", + "ratings": [ + 2, + 6, + 5 + ], + "rating": "4.50", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "State-of-the-art models for high-resolution image generation, such as BigGAN and VQVAE-2, require an incredible amount of compute resources and/or time (512 TPU-v3 cores) to train, putting them out of reach for the larger research community. On the other hand, GAN-based image super-resolution models, such as ESRGAN, can not only upscale images to high dimensions, but also are efficient to train. In this paper, we present not-so-big-GAN (nsb-GAN), a simple yet cost-effective two-step training framework for deep generative models (DGMs) of high-dimensional natural images. First, we generate images in low-frequency bands by training a sampler in the wavelet domain. Then, we super-resolve these images from the wavelet domain back to the pixel-space with our novel wavelet super-resolution decoder network. Wavelet-based down-sampling method preserves more structural information than pixel-based methods, leading to significantly better generative quality of the low-resolution sampler (e.g., 64\u00d764). Since the sampler and decoder can be trained in parallel and operate on much lower dimensional spaces than end-to-end models, the training cost is substantially reduced. 
On ImageNet 512\u00d7512, our model achieves a Fr\u00e9chet Inception Distance (FID) of 10.59 \u2013 beating the baseline BigGAN model \u2013 at half the compute (256 TPU-v3 cores).", + "title": "not-so-big-GAN: Generating High-Fidelity Images on Small Compute with Wavelet-based Super-Resolution", + "authors": [ + "Seungwook Han", + "Akash Srivastava", + "Cole Lincoln Hurwitz", + "Prasanna Sattigeri", + "David Daniel Cox" + ], + "emails": [ + "~David_Daniel_Cox1", + "~Akash_Srivastava1", + "~Cole_Lincoln_Hurwitz1", + "~Seungwook_Han1", + "~Prasanna_Sattigeri1" + ], + "rank": 2288 + }, + { + "url": "https://openreview.net/forum?id=pBDwTjmdDo", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 5 + ], + "rating": "4.50", + "confidences": [ + 5, + 5, + 4, + 4 + ], + "abstract": "Static graph representation learning has been applied in many tasks over the years thanks to the invention of unsupervised graph embedding methods and more recently, graph neural networks (GNNs). However, in many cases, we are to handle dynamic graphs where the structures of graphs and labels of the nodes are evolving steadily with time. This has posed a great challenge to existing methods in time and memory efficiency. In this work, we present a new method named Fourier Temporal State Embedding (FTSE) to address the temporal information in dynamic graph representation learning. FTSE offered time and memory-efficient solution through applying signal processing techniques to the temporal graph signals. We paired the Fourier Transform with an efficient edge network and provided a new prototype of modeling dynamic graph evolution with high precision. FTSE can also prevent the 'history explosion' that exists in sequential models. 
The empirical study shows that our proposed approach achieves significantly better performance than previous approaches on public datasets across multiple tasks.", + "title": "Dynamic Graph Representation Learning with Fourier Temporal State Embedding", + "authors": [ + "Yihan He", + "Wei Cao", + "Shun Zheng", + "Zhifeng Gao", + "Jiang Bian" + ], + "emails": [ + "~Jiang_Bian1", + "~Zhifeng_Gao1", + "~Yihan_He1", + "~Wei_Cao1", + "~Shun_Zheng1" + ], + "rank": 2289 + }, + { + "url": "https://openreview.net/forum?id=SO73JUgks8", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 5 + ], + "rating": "4.50", + "confidences": [ + 4, + 4, + 5, + 5 + ], + "abstract": "How can we effectively regularize BERT? Although BERT proves its effectiveness in various downstream natural language processing tasks, it often over\ufb01ts when there are only a small number of training instances. A promising direction to regularize BERT is based on pruning its attention heads based on a proxy score for head importance. However, heuristic-based methods are usually suboptimal since they predetermine the order by which attention heads are pruned. In order to overcome such a limitation, we propose AUBER, an effective regularization method that leverages reinforcement learning to automatically prune attention heads from BERT. Instead of depending on heuristics or rule-based policies, AUBER learns a pruning policy that determines which attention heads should or should not be pruned for regularization. Experimental results show that AUBER outperforms existing pruning methods by achieving up to 10% better accuracy. 
In addition, our ablation study empirically demonstrates the effectiveness of our design choices for AUBER.", + "title": "AUBER: Automated BERT Regularization", + "authors": [ + "Hyun Dong Lee", + "Seongmin Lee", + "U Kang" + ], + "emails": [ + "~Hyun_Dong_Lee1", + "~Seongmin_Lee2", + "~U_Kang1" + ], + "rank": 2290 + }, + { + "url": "https://openreview.net/forum?id=AcH9xD24Hd", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5, + 4 + ], + "rating": "4.50", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "We consider the problem of how to learn a step-size policy for the Limited-Memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) algorithm. This is a limited computational memory quasi-Newton method widely used for deterministic unconstrained optimization but currently avoided in large-scale problems for requiring step sizes to be provided at each iteration. Existing methodologies for the step size selection for L-BFGS use heuristic tuning of design parameters and massive re-evaluations of the objective function and gradient to find appropriate step-lengths. We propose a neural network architecture with local information of the current iterate as the input. The step-length policy is learned from data of similar optimization problems, avoids additional evaluations of the objective function, and guarantees that the output step remains inside a pre-defined interval. The corresponding training procedure is formulated as a stochastic optimization problem using the backpropagation through time algorithm. The performance of the proposed method is evaluated on the training of classifiers for the MNIST database for handwritten digits and for CIFAR-10. The results show that the proposed algorithm outperforms heuristically tuned optimizers such as ADAM, RMSprop, L-BFGS with a backtracking line search and L-BFGS with a constant step size. 
The numerical results also show that a learned policy can be used as a warm-start to train new policies for different problems after a few additional training steps, highlighting its potential use in multiple large-scale optimization problems.", + "title": "Learning the Step-size Policy for the Limited-Memory Broyden-Fletcher-Goldfarb-Shanno Algorithm", + "authors": [ + "Lucas N. Egidio", + "Anders Hansson", + "Bo Wahlberg" + ], + "emails": [ + "liu.se", + "~Bo_Wahlberg1", + "~Lucas_N._Egidio1" + ], + "rank": 2291 + }, + { + "url": "https://openreview.net/forum?id=OkXODFHhfum", + "ratings": [ + 4, + 5, + 4, + 5 + ], + "rating": "4.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "One of the long-term goals of machine learning is to develop models which will generalize well enough that they can handle a broader range of circumstances than what they were trained on. In the context of vision, such a model would not get confounded by elements outside of its training distribution. To this end, we propose a new task for training neural networks, in which the goal is to determine if all images in a given set share the same class. We demonstrate that a model trained with this task can classify and cluster samples from out-of-distribution classes. This includes left out classes from the same dataset, as well as entire datasets never trained on. Our experiments also reveal an unreported phenomenon, which is that neural networks can overfit their training classes, leading to poorer out-of-distribution performance. 
It is our belief that mitigating this effect and improving on our task will lead to better out-of-distribution generalization as well as behaviours more resembling those of humans.", + "title": "Out-of-Distribution Classification and Clustering", + "authors": [ + "Gabriele Prato", + "Sarath Chandar" + ], + "emails": [ + "~Sarath_Chandar1", + "~Gabriele_Prato1" + ], + "rank": 2292 + }, + { + "url": "https://openreview.net/forum?id=LDSeViRs4-Q", + "decision": "Reject", + "ratings": [ + 4, + 4, + 6, + 4 + ], + "rating": "4.50", + "confidences": [ + 4, + 4, + 3, + 1 + ], + "abstract": "Deep neural networks (DNNs), including convolutional neural networks, are known to be vulnerable to adversarial attacks, which may lead to disastrous consequences in life-critical applications. Adversarial samples are usually generated by attack algorithms and can also be induced by white noises, and therefore the threats are real. In this study, we propose a novel training method, named Increasing Margin Adversarial (IMA) Training, to improve DNN robustness against adversarial noises. During training, the IMA method increases the margins of training samples by moving the decision boundaries of the DNN model far away from the training samples to improve robustness. The IMA method is evaluated on six publicly available datasets (including a COVID-19 CT image dataset) under strong 100-PGD white-box adversarial attacks, and the results show that the proposed method significantly improved classification accuracy on noisy data while keeping a relatively high accuracy on clean data. 
We hope our approach may facilitate the development of robust DNN applications, especially for COVID-19 diagnosis using CT images.", + "title": "Increasing-Margin Adversarial (IMA) training to Improve Adversarial Robustness of Neural Networks", + "authors": [ + "Linhai Ma", + "Liang Liang" + ], + "emails": [ + "~Liang_Liang2", + "~Linhai_Ma1" + ], + "rank": 2293 + }, + { + "url": "https://openreview.net/forum?id=xQnvyc6r3LL", + "decision": "Reject", + "ratings": [ + 3, + 5, + 3, + 7 + ], + "rating": "4.50", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "Locating the source of an epidemic, or patient zero (P0), can provide critical insights into the infection's transmission course and allow efficient resource allocation. \nExisting methods use graph-theoretic centrality measures and expensive message-passing algorithms, requiring knowledge of the underlying dynamics and its parameters.\nIn this paper, we revisit this problem using graph neural networks (GNNs) to learn P0. \nWe observe that GNNs can identify P0 close to the theoretical bound on accuracy, without explicit input of dynamics or its parameters.\nIn addition, GNN is over 100 times faster than classic methods for inference on arbitrary graph topologies.\nOur theoretical bound also shows that the epidemic is like a ticking clock, emphasizing the importance of early contact-tracing. 
\nWe find a maximum time after which accurate recovery of the source becomes impossible, regardless of the algorithm used.", + "title": "Finding Patient Zero: Learning Contagion Source with Graph Neural Networks", + "authors": [ + "Chintan Shah", + "Nima Dehmamy", + "Nicola Perra", + "Matteo Chinazzi", + "Albert-Laszlo Barabasi", + "Alessandro Vespignani", + "Rose Yu" + ], + "emails": [ + "~Rose_Yu1", + "~Albert-Laszlo_Barabasi1", + "gmail.com", + "~Matteo_Chinazzi1", + "~Chintan_Shah2", + "~Alessandro_Vespignani1", + "~Nima_Dehmamy1" + ], + "rank": 2294 + }, + { + "url": "https://openreview.net/forum?id=zgMPc_48Zb", + "decision": "Reject", + "ratings": [ + 6, + 4, + 4 + ], + "rating": "4.50", + "confidences": [ + 2, + 3, + 3 + ], + "abstract": "Although machine learning models trained on massive data have led to breakthroughs in several areas, their deployment in privacy-sensitive domains remains limited due to restricted access to data. Generative models trained with privacy constraints on private data can sidestep this challenge and provide indirect access to the private data instead. We propose DP-Sinkhorn, a novel optimal transport-based generative method for learning data distributions from private data with differential privacy. DP-Sinkhorn relies on minimizing the Sinkhorn divergence---a computationally efficient approximation to the exact optimal transport distance---between the model and the data in a differentially private manner and also uses a novel technique for conditional generation in the Sinkhorn framework. Unlike existing approaches for training differentially private generative models, which are mostly based on generative adversarial networks, we do not rely on adversarial objectives, which are notoriously difficult to optimize, especially in the presence of noise imposed by the privacy constraints. Hence, DP-Sinkhorn is easy to train and deploy. 
Experimentally, despite our method's simplicity we improve upon the state-of-the-art on multiple image modeling benchmarks. We also show differentially private synthesis of informative RGB images, which has not been demonstrated before by differentially private generative models without the use of auxiliary public data.", + "title": "Differentially Private Generative Models Through Optimal Transport", + "authors": [ + "Tianshi Cao", + "Alex Bie", + "Karsten Kreis", + "Sanja Fidler" + ], + "emails": [ + "~Karsten_Kreis1", + "~Sanja_Fidler1", + "~Tianshi_Cao1", + "nvidia.com" + ], + "rank": 2295 + }, + { + "url": "https://openreview.net/forum?id=6_FjMpi_ebO", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4, + 5 + ], + "rating": "4.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Neural image classification models typically consist of two components. The first is an image encoder, which is responsible for encoding a given raw image into a representative vector. The second is the classification component, which is often implemented by projecting the representative vector onto target class vectors. The target class vectors, along with the rest of the model parameters, are estimated so as to minimize the loss function. \n\nIn this paper, we analyze how simple design choices for the classification layer affect the learning dynamics. We show that the standard cross-entropy training implicitly captures visual similarities between different classes, which might deteriorate accuracy or even prevents some models from converging. We propose to draw the class vectors randomly and set them as fixed during training, thus invalidating the visual similarities encoded in these vectors. 
We analyze the effects of keeping the class vectors fixed and show that it can increase the inter-class separability, intra-class compactness, and the overall model accuracy, while maintaining the robustness to image corruptions and the generalization of the learned concepts.", + "title": "Redesigning the Classification Layer by Randomizing the Class Representation Vectors", + "authors": [ + "Gabi Shalev", + "Gal Lev Shalev", + "Yossi Keshet" + ], + "emails": [ + "~Gabi_Shalev1", + "~Yossi_Keshet1", + "~Gal_Lev_Shalev1" + ], + "rank": 2296 + }, + { + "url": "https://openreview.net/forum?id=uOjm_xqKEoX", + "ratings": [ + 2, + 6, + 4, + 6, + 5 + ], + "rating": "4.50", + "confidences": [ + 5, + 4, + 3, + 4, + 4 + ], + "abstract": "In practice, due to the need for online interaction with the environment, deploying reinforcement learning (RL) agents in safety-critical scenarios is challenging. Offline RL offers the promise of directly leveraging large, previously collected datasets to acquire effective policies without further interaction. Although a number of offline RL algorithms have been proposed, their performance is generally limited by the quality of dataset. To address this problem, we propose an Adaptive Policy constrainT (AdaPT) method, which allows effective exploration on out-of-distribution actions by imposing an adaptive constraint on the learned policy. We theoretically show that AdaPT produces a tight upper bound on the distributional deviation between the learned policy and the behavior policy, and this upper bound is the minimum requirement to guarantee policy improvement at each iteration. Formally, we present a practical AdaPT augmented Actor-Critic (AdaPT-AC) algorithm, which is able to learn a generalizable policy even from the dataset that contains a large amount of random data and induces a bad behavior policy. 
The empirical results on a range of continuous control benchmark tasks demonstrate that AdaPT-AC substantially outperforms several popular algorithms in terms of both final performance and robustness on four datasets of different qualities.", + "title": "Robust Offline Reinforcement Learning from Low-Quality Data", + "authors": [ + "Wenjie Shi", + "Tianchi Cai", + "Shiji Song", + "Lihong Gu", + "Jinjie Gu", + "Gao Huang" + ], + "emails": [ + "~Wenjie_Shi1", + "antgroup.com", + "~Tianchi_Cai1", + "~Gao_Huang1", + "~Shiji_Song1" + ], + "rank": 2297 + }, + { + "url": "https://openreview.net/forum?id=mZLhA0xFGmR", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 4 + ], + "rating": "4.50", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Canonical Correlation Analysis (CCA) models can extract informative correlated representations from multimodal unlabelled data. Despite their success, CCA models may break if the number of variables exceeds the number of samples. We propose Deep Gated-CCA, a method for learning correlated representations based on a sparse subset of variables from two observed modalities. The proposed procedure learns two non-linear transformations and simultaneously gates the input variables to identify a subset of most correlated variables. The non-linear transformations are learned by training two neural networks to maximize a shared correlation loss defined based on their outputs. Gating is obtained by adding an approximate \u21130 regularization term applied to the input variables. This approximation relies on a recently proposed continuous Gaussian based relaxation for Bernoulli variables which act as gates. We demonstrate the efficacy of the method using several synthetic and real examples. 
Most notably, the method outperforms other linear and non-linear CCA models.", + "title": "Deep Gated Canonical Correlation Analysis", + "authors": [ + "Ofir Lindenbaum", + "Moshe Salhov", + "Amir Averbuch", + "Yuval Kluger" + ], + "emails": [ + "~Amir_Averbuch1", + "013.net", + "~Yuval_Kluger1", + "~Ofir_Lindenbaum1" + ], + "rank": 2298 + }, + { + "url": "https://openreview.net/forum?id=PXDdWQDBsCG", + "decision": "Reject", + "ratings": [ + 6, + 5, + 4, + 3 + ], + "rating": "4.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Humans rely heavily on shape information to recognize objects. Conversely, convolutional\nneural networks (CNNs) are biased more towards texture. This fact\nis perhaps the main reason why CNNs are susceptible to adversarial examples.\nHere, we explore how shape bias can be incorporated into CNNs to improve their\nrobustness. Two algorithms are proposed, based on the observation that edges are\ninvariant to moderate imperceptible perturbations. In the first one, a classifier is\nadversarially trained on images with the edge map as an additional channel. At\ninference time, the edge map is recomputed and concatenated to the image. In the\nsecond algorithm, a conditional GAN is trained to translate the edge maps, from\nclean and/or perturbed images, into clean images. The inference is done over the\ngenerated image corresponding to the input\u2019s edge map. A large number of experiments\nwith more than 10 data sets have proved the effectiveness of the proposed\nalgorithms against FGSM and `1 PGD-40 attacks. against FGSM and `\u2113\u221e PGD-40 attacks. \nFurther, we show that edge information can a) benefit other adversarial training methods, b) be even more effective\nin conjunction with background subtraction, c) be used to defend against poisoning\nattacks, and d) make CNNs more robust against natural image corruptions\nsuch as motion blur, impulse noise, and JPEG compression, than CNNs trained\nsolely on RGB images. 
From a broader perspective, our study suggests that CNNs\ndo not adequately account for image structures and operations that are crucial for\nrobustness. The code is available at: https://github.com/[masked].", + "title": "Shape Defense", + "authors": [ + "ali borji" + ], + "emails": [ + "~ali_borji1" + ], + "rank": 2299 + }, + { + "url": "https://openreview.net/forum?id=K6YbHUIWHOy", + "decision": "Reject", + "ratings": [ + 3, + 5, + 5, + 5 + ], + "rating": "4.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "The expressive power of graph neural networks (GNN) has drawn much interest recently. Most existent work focused on measuring the expressiveness of GNN through the task of distinguishing between graphs. In this paper, we inspect the representation limits of locally unordered messaging passing (LUMP) GNN architecture through the lens of \\emph{node classification}. For GNNs based on permutation invariant local aggregators, we characterize graph-theoretic conditions under which such GNNs fail to discriminate simple instances, regardless of underlying architecture or network depth. To overcome this limitation, we propose a novel framework to augment GNNs with global graph information called \\emph{memory augmentation}. Specifically, we allow every node in the original graph to interact with a group of memory nodes. For each node, information from all the other nodes in the graph can be gleaned through the relay of the memory nodes. For proper backbone architectures like GAT and GCN, memory augmented GNNs are theoretically shown to be more expressive than LUMP GNNs. Empirical evaluations demonstrate the significant improvement of memory augmentation. In particular, memory augmented GAT and GCN are shown to either outperform or closely match state-of-the-art performance across various benchmark datasets. 
", + "title": "Memory Augmented Design of Graph Neural Networks", + "authors": [ + "Tao Xiong", + "Liang Zhu", + "Ruofan Wu", + "Yuan Qi" + ], + "emails": [ + "~Ruofan_Wu1", + "antgroup.com", + "~Tao_Xiong3" + ], + "rank": 2300 + }, + { + "url": "https://openreview.net/forum?id=BnokSKnhC7F", + "decision": "Reject", + "ratings": [ + 5, + 3, + 5, + 6, + 4 + ], + "rating": "4.50", + "confidences": [ + 3, + 4, + 3, + 3, + 3 + ], + "abstract": "Reinforcement learning (RL) algorithms typically deal with maximizing the expected cumulative return (discounted or undiscounted, finite or infinite horizon). However, several crucial applications in the real world, such as drug discovery, do not fit within this framework because an RL agent only needs to identify states (molecules) that achieve the highest reward within a trajectory and does not need to optimize for the expected cumulative return. In this work, we formulate an objective function to maximize the expected maximum reward along a trajectory, derive a novel functional form of the Bellman equation, introduce the corresponding Bellman operators, and provide a proof of convergence. Using this formulation, we achieve state-of-the-art results on the task of molecule generation that mimics a real-world drug discovery pipeline.", + "title": "Maximum Reward Formulation In Reinforcement Learning", + "authors": [ + "SaiKrishna Gottipati", + "Yashaswi Pathak", + "Rohan Nuttall", + ". Sahir", + "Raviteja Chunduru", + "Ahmed Touati", + "Sriram Ganapathi Subramanian", + "Matthew E. 
Taylor", + "Sarath Chandar" + ], + "emails": [ + "~SaiKrishna_Gottipati1", + "~Matthew_E._Taylor2", + "~._Sahir1", + "~Ahmed_Touati1", + "~Yashaswi_Pathak1", + "ualberta.ca", + "~Sarath_Chandar1", + "~Sriram_Ganapathi_Subramanian1", + "~Raviteja_Chunduru1" + ], + "rank": 2301 + }, + { + "url": "https://openreview.net/forum?id=CaCHjsqCBJV", + "decision": "Reject", + "ratings": [ + 5, + 5, + 6, + 3 + ], + "rating": "4.50", + "confidences": [ + 4, + 5, + 2, + 5 + ], + "abstract": "We propose a framework which makes it feasible to directly train deep neural networks with respect to popular families of task-specific non-decomposable per- formance measures such as AUC, multi-class AUC, F -measure and others, as well as models such as non-negative matrix factorization. A common feature of the optimization model that emerges from these tasks is that it involves solving a Linear Programs (LP) during training where representations learned by upstream layers influence the constraints. The constraint matrix is not only large but the constraints are also modified at each iteration. We show how adopting a set of influential ideas proposed by Mangasarian for 1-norm SVMs \u2013 which advocates for solving LPs with a generalized Newton method \u2013 provides a simple and effective solution. In particular, this strategy needs little unrolling, which makes it more efficient during backward pass. While a number of specialized algorithms have been proposed for the models that we de- scribe here, our module turns out to be applicable without any specific adjustments or relaxations. We describe each use case, study its properties and demonstrate the efficacy of the approach over alternatives which use surrogate lower bounds and often, specialized optimization schemes. 
Frequently, we achieve superior computational behavior and performance improvements on common datasets used in the literature.\n", + "title": "Differentiable Optimization of Generalized Nondecomposable Functions using Linear Programs", + "authors": [ + "Zihang Meng", + "Lopamudra Mukherjee", + "Vikas Singh", + "Sathya N. Ravi" + ], + "emails": [ + "~Lopamudra_Mukherjee1", + "~Zihang_Meng1", + "~Sathya_N._Ravi1", + "~Vikas_Singh1" + ], + "rank": 2302 + }, + { + "url": "https://openreview.net/forum?id=Wg2PSpLZiH", + "ratings": [ + 4, + 5, + 4, + 5 + ], + "rating": "4.50", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Vision-language pre-training (VLP) on large-scale image-text pairs has recently witnessed rapid progress for learning cross-modal representations. Existing pre-training methods either directly concatenate image representation and text representation at a feature level as input to a single-stream Transformer, or use a two-stream cross-modal Transformer to align the image-text representation at a high-level semantic space. In real-world image-text data, we observe that it is easy for some of the image-text pairs to align simple semantics on both modalities, while others may be related after higher-level abstraction. Therefore, in this paper, we propose a new pre-training method SemVLP, which jointly aligns both the low-level and high-level semantics between image and text representations. The model is pre-trained iteratively with two prevalent fashions: single-stream pre-training to align low-level semantics and two-stream pre-training to align high-level semantics, by employing a shared Transformer network with a pluggable cross-modal attention module. 
An extensive set of experiments have been conducted on four well-established vision-language understanding tasks to demonstrate the effectiveness of the proposed SemVLP in aligning cross-modal representations towards different semantic granularities.", + "title": "SemVLP: Vision-Language Pre-training by Aligning Semantics at Multiple Levels", + "authors": [ + "Chenliang Li", + "Ming Yan", + "Haiyang Xu", + "Fuli Luo", + "Wei Wang", + "Bin Bi", + "Songfang Huang" + ], + "emails": [ + "~Songfang_Huang1", + "~Wei_Wang41", + "~Fuli_Luo1", + "~Bin_Bi1", + "~Haiyang_Xu1", + "~Chenliang_Li2", + "~Ming_Yan2" + ], + "rank": 2303 + }, + { + "url": "https://openreview.net/forum?id=SQbBWB0vjFn", + "ratings": [ + 5, + 4, + 6, + 5, + 3 + ], + "rating": "4.48", + "confidences": [ + 5, + 4, + 3, + 4, + 5 + ], + "abstract": "In this paper, we introduce novel lightweight generative adversarial networks, which can effectively capture long-range dependencies in the image generation process, and produce high-quality results with a much simpler architecture. To achieve this, we first introduce a long-range module, allowing the network to dynamically adjust the number of focused sampling pixels and to also augment sampling locations. Thus, it can break the limitation of the fixed geometric structure of the convolution operator, and capture long-range dependencies in both spatial and channel-wise directions. Also, the proposed long-range module can highlight negative relations between pixels, working as a regularization to stabilize training. Furthermore, we propose a new generation strategy through which we introduce metadata into the image generation process to provide basic information about target images, which can stabilize and speed up the training process. Our novel long-range module only introduces few additional parameters and is easily inserted into existing models to capture long-range dependencies. 
Extensive experiments demonstrate the competitive performance of our method with a lightweight architecture.", + "title": "Lightweight Long-Range Generative Adversarial Networks", + "authors": [ + "Bowen Li", + "Thomas Lukasiewicz" + ], + "emails": [ + "~Thomas_Lukasiewicz2", + "~Bowen_Li2" + ], + "rank": 2304 + }, + { + "url": "https://openreview.net/forum?id=5B8YAz6W3eX", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5, + 5 + ], + "rating": "4.47", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "In this paper, we introduce Apollo, a quasi-newton method for noncovex stochastic optimization, which dynamically incorporates the curvature of the loss function by approximating the Hessian via a diagonal matrix. Algorithmically, Apollo requires only first-order gradients and updates the approximation of the Hessian diagonally such that it satisfies the weak secant relation. To handle nonconvexity, we replace the Hessian with its absolute value, the computation of which is also efficient under our diagonal approximation, yielding an optimization algorithm with linear complexity for both time and memory. Experimentally, through three tasks on vision and language we show that Apollo achieves significant improvements over other stochastic optimization methods, including SGD and variants of Adam, in term of both convergence speed and generalization performance.", + "title": "Apollo: An Adaptive Parameter-wised Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization", + "authors": [ + "Xuezhe Ma" + ], + "emails": [ + "~Xuezhe_Ma1" + ], + "rank": 2305 + }, + { + "url": "https://openreview.net/forum?id=PeT5p3ocagr", + "decision": "Reject", + "ratings": [ + 5, + 3, + 5, + 5 + ], + "rating": "4.47", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Gradient-based policy search algorithms (such as PPO, SAC or TD3) in deep reinforcement learning (DRL) have shown successful results on a range of challenging control tasks. 
However, they often suffer from flat or deceptive gradient problems. As an alternative to policy gradient methods, population-based evolutionary approaches have been applied to DRL. While population-based search algorithms show more robust learning in a broader range of tasks, they are usually inefficient in the use of samples. Recently, reported are a few attempts (such as CEMRL) to combine gradient with a population in searching optimal policy. This kind of hybrid algorithm takes advantage of both camps. In this paper, we propose yet another hybrid algorithm, which more tightly couples policy gradient with the population-based search. More specifically, we use the Cross-Entropy Method (CEM) for population-based search and Twin Delayed Deep Deterministic Policy Gradient (TD3) for policy gradient. In the proposed algorithm called Coupling Policy Gradient with Population-based Search (PGPS), a single TD3 agent, which learns by a gradient from all experiences generated by population, leads a population by providing its critic function Q as a surrogate to select better performing next-generation population from candidates. On the other hand, if the TD3 agent falls behind the CEM population, then the TD3 agent is updated toward the elite member of the CEM population using loss function augmented with the distance between the TD3 and the CEM elite. 
Experiments in a MuJoCo environment show that PGPS is robust to deceptive gradient and also outperforms the state-of-the-art algorithms.\n", + "title": "PGPS : Coupling Policy Gradient with Population-based Search", + "authors": [ + "Namyong Kim", + "Hyunsuk Baek", + "Hayong Shin" + ], + "emails": [ + "~Hayong_Shin1", + "kaist.ac.kr", + "~Namyong_Kim1" + ], + "rank": 2306 + }, + { + "url": "https://openreview.net/forum?id=HQoCa9WODc0", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5, + 4 + ], + "rating": "4.47", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "While only trained to reconstruct training data, autoencoders may produce high-quality reconstructions of inputs that are well outside the training data distribution. This phenomenon, which we refer to as outlier reconstruction, has a detrimental effect on the use of autoencoders for outlier detection, as an autoencoder will misclassify a clear outlier as being in-distribution. In this paper, we introduce the Energy-Based Autoencoder (EBAE), an autoencoder that is considerably less susceptible to outlier reconstruction. \nThe core idea of EBAE is to treat the reconstruction error as an energy function of a normalized density and to strictly enforce the normalization constraint. We show that the reconstruction of non-training inputs can be suppressed, and the reconstruction error made highly discriminative to outliers, by enforcing this constraint. We empirically show that EBAE significantly outperforms both existing autoencoders and other generative models for several out-of-distribution detection tasks.", + "title": "Suppressing Outlier Reconstruction in Autoencoders for Out-of-Distribution Detection", + "authors": [ + "Sangwoong Yoon", + "Yung-Kyun Noh", + "Frank C. 
Park" + ], + "emails": [ + "~Frank_C._Park1", + "~Yung-Kyun_Noh1", + "~Sangwoong_Yoon1" + ], + "rank": 2307 + }, + { + "url": "https://openreview.net/forum?id=_sCOYXNwaI", + "decision": "Reject", + "ratings": [ + 5, + 6, + 3, + 5 + ], + "rating": "4.47", + "confidences": [ + 4, + 2, + 5, + 4 + ], + "abstract": "In this paper we propose a spatial transformer network where the spatial transformations are limited to the group of diffeomorphisms. Diffeomorphic transformations are a kind of homeomorphism, which by definition preserve topology, a compelling property in certain applications.\nWe apply this diffemorphic spatial transformer to model the output of a neural network as a topology preserving mapping of a prior shape. By carefully choosing the prior shape we can enforce properties on the output of the network without requiring any changes to the loss function, such as smooth boundaries and a hard constraint on the number of connected components.\nThe diffeomorphic transformer networks outperform their non-diffeomorphic precursors when applied to learn data invariances in classification tasks. On a breast tissue segmentation task, we show that the approach is robust and flexible enough to deform simple artificial priors, such as Gaussian-shaped prior energies, into high-quality predictive probability densities. In addition to desirable topological properties, the segmentation maps have competitive quantitative fidelity compared to those obtained by direct estimation (i.e. plain U-Net).", + "title": "Diffeomorphic Template Transformers", + "authors": [ + "Tycho F.A. van der Ouderaa", + "Ivana Isgum", + "Wouter B. Veldhuis", + "Bob D. 
De Vos", + "Pim Moeskops" + ], + "emails": [ + "~Ivana_Isgum1", + "umcutrecht.nl", + "~Pim_Moeskops2", + "~Tycho_F.A._van_der_Ouderaa1", + "~Bob_D._De_Vos2" + ], + "rank": 2308 + }, + { + "url": "https://openreview.net/forum?id=4zr9e5xwZ9Y", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 5 + ], + "rating": "4.47", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "Modern machine learning techniques are successfully being adapted to data modeled as graphs. However, many real-world graphs are typically very large and do not fit in memory, often making the problem of training machine learning models on them intractable. Distributed training has been successfully employed to alleviate memory problems and speed up training in machine learning domains in which the input data is assumed to be independently identical distributed (i.i.d). However, distributing the training of non i.i.d data such as graphs that are used as training inputs in Graph Convolutional Networks (GCNs) causes accuracy problems since information is lost at the graph partitioning boundaries.\n\nIn this paper, we propose a training strategy that mitigates the lost information across multiple partitions of a graph through a subgraph approximation scheme. Our proposed approach augments each sub-graph with a small amount of edge and vertex information that is approximated from all other sub-graphs. The subgraph approximation approach helps the distributed training system converge at single-machine accuracy, while keeping the memory footprint low and minimizing synchronization overhead between the machines. 
", + "title": "Distributed Training of Graph Convolutional Networks using Subgraph Approximation", + "authors": [ + "Alexandra Angerd", + "Keshav Balasubramanian", + "Murali Annavaram" + ], + "emails": [ + "~Murali_Annavaram1", + "~Alexandra_Angerd1", + "usc.edu" + ], + "rank": 2309 + }, + { + "url": "https://openreview.net/forum?id=wJj5gUHKhJs", + "ratings": [ + 5, + 4, + 5, + 4 + ], + "rating": "4.47", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Contrastive learning is an effective method for learning visual representations. In most cases, this involves adding an explicit loss function to encourage similar images to have similar representations, and different images to have different representations. In this paper, we introduce a clever construction for Implicit Contrastive Learning (ImCLR), primarily in the supervised setting: there, the network can implicitly learn to differentiate between similar and dissimilar images. Furthermore, this requires almost no change to existing pipelines, which allows for easy integration and for fair demonstration of effectiveness on a wide range of well-accepted benchmarks. Namely, there is no change to loss, no change to hyperparameters, and no change to general network architecture. \nWe show that ImCLR improves the test error in the supervised setting across a variety of settings, including 3.24% on Tiny ImageNet, 1.30% on CIFAR-100, 0.14% on CIFAR-10, and 2.28% on STL-10. We show that this holds across different number of labeled samples, maintaining approximately a 2% gap in test accuracy down to using only 5\\% of the whole dataset. We further show that gains hold for robustness to common input corruptions and perturbations at varying severities with a 0.72% improvement on CIFAR-100-C, and in the semi-supervised setting with a 2.16% improvement with the standard benchmark \u03a0-model. 
We demonstrate that ImCLR is complementary to existing data augmentation techniques, achieving over 1% improvement on CIFAR-100 by combining ImCLR with CutMix over either baseline, and 2\\% by combining ImCLR with AutoAugment over either baseline. Finally, we perform an ablation study.", + "title": "ImCLR: Implicit Contrastive Learning for Image Classification", + "authors": [ + "John Chen", + "Samarth Sinha", + "Anastasios Kyrillidis" + ], + "emails": [ + "~John_Chen3", + "~Samarth_Sinha1", + "~Anastasios_Kyrillidis2" + ], + "rank": 2310 + }, + { + "url": "https://openreview.net/forum?id=25OSRH9H0Gi", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5, + 5 + ], + "rating": "4.47", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Most of existing deep learning models rely on excessive amounts of labeled training data in order to achieve state-of-the-art results, even though these data can be hard or costly to get in practice. One attractive alternative is to learn with little supervision, commonly referred to as few-shot learning (FSL), and, in particular, meta-learning that learns to learn with few data from related tasks. Despite the practical success of meta-learning, many of its algorithmic solutions proposed in the literature are based on sound intuitions, but lack a solid theoretical analysis of the expected performance on the test task. In this paper, we review the recent advances in meta-learning theory and show how they can be used in practice both to better understand the behavior of popular meta-learning algorithms and to improve their generalization capacity. This latter is achieved by integrating the theoretical assumptions ensuring efficient meta-learning in the form of regularization terms into several popular meta-learning algorithms for which we provide a large study of their behavior on classic few-shot classification benchmarks. 
To the best of our knowledge, this is the first contribution that puts the most recent learning bounds of meta-learning theory into practice for the popular task of few-shot classification. ", + "title": "Putting Theory to Work: From Learning Bounds to Meta-Learning Algorithms", + "authors": [ + "Quentin Bouniot", + "Ievgen Redko", + "Romaric Audigier", + "Ang\u00e9lique Loesch", + "Amaury Habrard" + ], + "emails": [ + "~Ievgen_Redko2", + "cea.fr", + "~Quentin_Bouniot1", + "~Amaury_Habrard1" + ], + "rank": 2311 + }, + { + "url": "https://openreview.net/forum?id=HW4aTJHx0X", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 5 + ], + "rating": "4.47", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "With thousands of academic articles shared on a daily basis, it has become increasingly difficult to keep up with the latest scientific findings. To overcome this problem, we introduce a new task of disentangled paper summarization, which seeks to generate separate summaries for the paper contributions and the context of the work, making it easier to identify the key findings shared in articles. For this purpose, we extend the S2ORC corpus of academic articles, which spans a diverse set of domains ranging from economics to psychology, by adding disentangled \"contribution\" and \"context\" reference labels. Together with the dataset, we introduce and analyze three baseline approaches: 1) a unified model controlled by input code prefixes, 2) a model with separate generation heads specialized in generating the disentangled outputs, and 3) a training strategy that guides the model using additional supervision coming from inbound and outbound citations. We also propose a comprehensive automatic evaluation protocol which reports the relevance, novelty, and disentanglement of generated outputs. 
Through a human study involving expert annotators, we show that in 79%, of cases our new task is considered more helpful than traditional scientific paper summarization.\n", + "title": "What's new? Summarizing Contributions in Scientific Literature", + "authors": [ + "Hiroaki Hayashi", + "Wojciech Maciej Kryscinski", + "Bryan McCann", + "Nazneen Rajani", + "Caiming Xiong" + ], + "emails": [ + "~Nazneen_Rajani1", + "~Bryan_McCann1", + "~Wojciech_Maciej_Kryscinski1", + "~Hiroaki_Hayashi1", + "~Caiming_Xiong1" + ], + "rank": 2312 + }, + { + "url": "https://openreview.net/forum?id=KiFeuZu24k", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4, + 5 + ], + "rating": "4.47", + "confidences": [ + 5, + 3, + 4, + 5 + ], + "abstract": "Recently, a series of works in computer vision have shown promising results on various image and video understanding tasks using self-attention. However, due to the quadratic computational and memory complexities of self-attention, these works either apply attention only to low-resolution feature maps in later stages of a deep network or restrict the receptive field of attention in each layer to a small local region. To overcome these limitations, this work introduces a new global self-attention module, referred to as the GSA module, which is efficient enough to serve as the backbone component of a deep network. This module consists of two parallel layers: a content attention layer that attends to pixels based only on their content and a positional attention layer that attends to pixels based on their spatial locations. The output of this module is the sum of the outputs of the two layers. Based on the proposed GSA module, we introduce new standalone global attention-based deep networks that use GSA modules instead of convolutions to model pixel interactions. Due to the global extent of the proposed GSA module, a GSA network has the ability to model long-range pixel interactions throughout the network. 
Our experimental results show that GSA networks outperform the corresponding convolution-based networks significantly on the CIFAR-100 and ImageNet datasets while using less number of parameters and computations. The proposed GSA networks also outperform various existing attention-based networks on the ImageNet dataset.", + "title": "Global Self-Attention Networks for Image Recognition", + "authors": [ + "Zhuoran Shen", + "Irwan Bello", + "Raviteja Vemulapalli", + "Xuhui Jia", + "Ching-Hui Chen" + ], + "emails": [ + "~Zhuoran_Shen1", + "~Raviteja_Vemulapalli1", + "~Irwan_Bello1", + "~Ching-Hui_Chen2", + "~Xuhui_Jia1" + ], + "rank": 2313 + }, + { + "url": "https://openreview.net/forum?id=ALSupSRaBH", + "decision": "Reject", + "ratings": [ + 6, + 5, + 4, + 3 + ], + "rating": "4.47", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Clustering and prediction are two primary tasks in the fields of unsupervised and supervised learning, respectively. Although much of the recent advances in machine learning have been centered around those two tasks, the interdependent, mutually beneficial relationship between them is rarely explored. One could reasonably expect appropriately clustering the data would aid the downstream prediction task and, conversely, a better prediction performance for the downstream task could potentially inform a more appropriate clustering strategy. In this work, we focus on the latter part of this mutually beneficial relationship. To this end, we introduce Deep Goal-Oriented Clustering (DGC), a probabilistic framework that clusters the data by jointly using supervision via side-information and unsupervised modeling of the inherent data structure in an end-to-end fashion. 
We show the effectiveness of our model on a range of datasets by achieving prediction accuracies comparable to the state-of-the-art, while, more importantly in our setting, simultaneously learning congruent clustering strategies.", + "title": "Deep Goal-Oriented Clustering", + "authors": [ + "Yifeng Shi", + "Christopher M Bender", + "Linnea Olsson", + "Melissa Troester", + "Katherine A Hoadley", + "Junier Oliva", + "Marc Niethammer" + ], + "emails": [ + "med.unc.edu", + "live.unc.edu", + "~Christopher_M_Bender1", + "~Junier_Oliva1", + "~Marc_Niethammer1", + "unc.edu", + "~Yifeng_Shi3" + ], + "rank": 2314 + }, + { + "url": "https://openreview.net/forum?id=5SST78xEh4A", + "ratings": [ + 5, + 5, + 4, + 4 + ], + "rating": "4.47", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "To train robust deep neural networks (DNNs), we systematically study several target modification approaches, which include output regularisation, self and non-self label correction (LC). Two key issues are discovered: (1) Self LC is the most appealing as it exploits its own knowledge and requires no extra models. However, how to automatically decide the trust degree of a learner as training goes is not well answered in the literature? (2) Some methods penalise while the others reward low-entropy predictions, prompting us to ask which one is better?\n\nTo resolve the first issue, taking two well-accepted propositions\u2013deep neural networks learn meaningful patterns before fitting noise (Arpit et al., 2017) and minimum entropy regularisation principle (Grandvalet & Bengio, 2006)\u2013we propose a novel end-to-end method named ProSelfLC, which is designed according to learning time and entropy. Specifically, given a data point, we progressively increase trust in its predicted label distribution versus its annotated one if a model has been trained for enough time and the prediction is of low entropy (high confidence). 
For the second issue, according to ProSelfLC, we empirically prove that it is better to redefine a meaningful low-entropy status and optimise the learner toward it. This serves as a defence of entropy minimisation. \n\nWe demonstrate the effectiveness of ProSelfLC through extensive experiments in both clean and noisy settings.", + "title": "ProSelfLC: Progressive Self Label Correction for Training Robust Deep Neural Networks", + "authors": [ + "Xinshao Wang", + "Yang Hua", + "Elyor Kodirov", + "David A. Clifton", + "Neil M. Robertson" + ], + "emails": [ + "~David_A._Clifton1", + "~Yang_Hua2", + "~Xinshao_Wang1", + "~Neil_M._Robertson1", + "~Elyor_Kodirov1" + ], + "rank": 2315 + }, + { + "url": "https://openreview.net/forum?id=XavM6v_q59q", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 3 + ], + "rating": "4.47", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "As opposed to natural languages, source code understanding is influenced by grammar relations between tokens regardless of their identifier name. Considering graph representation of source code such as Abstract Syntax Tree (AST) and Control Flow Graph (CFG), can capture a token\u2019s grammatical relationships that are not obvious from the source code. Most existing methods are late fusion and underperform when supplementing the source code text with a graph representation. We propose a novel method called GN-Transformer to fuse representations learned from graph and text modalities under the Graph Networks (GN) framework with attention mechanism. Our method learns the embedding on a constructed graph called Syntax-Code Graph (SCG). We perform experiments on the structure of SCG, an ablation study on the model design and the hyper-paramaters to conclude that the performance advantage is from the fusion method and not the specific details of the model. 
The proposed method achieved state of the art performance in two code summarization datasets and across three metrics.", + "title": "GN-Transformer: Fusing AST and Source Code information in Graph Networks", + "authors": [ + "Junyan Cheng", + "Iordanis Fostiropoulos", + "Barry Boehm" + ], + "emails": [ + "usc.edu", + "~Barry_Boehm1", + "~Junyan_Cheng1" + ], + "rank": 2316 + }, + { + "url": "https://openreview.net/forum?id=nLYMajjctMh", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 6 + ], + "rating": "4.47", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "We propose a new optimization formulation for training federated learning models. The standard formulation has the form of an empirical risk minimization problem constructed to find a single global model trained from the private data stored across all participating devices. In contrast, our formulation seeks an explicit trade-off between this traditional global model and the local models, which can be learned by each device from its own private data without any communication. Further, we develop several efficient variants of SGD (with and without partial participation and with and without variance reduction) for solving the new formulation and prove communication complexity guarantees. Notably, our methods are similar but not identical to federated averaging / local SGD, thus shedding some light on the essence of the elusive method. In particular, our methods do not perform full averaging steps and instead merely take steps towards averaging. 
We argue for the benefits of this new paradigm for federated learning.\n", + "title": "Federated Learning of a Mixture of Global and Local Models", + "authors": [ + "Filip Hanzely", + "Peter Richtarik" + ], + "emails": [ + "~Peter_Richtarik1", + "~Filip_Hanzely2" + ], + "rank": 2317 + }, + { + "url": "https://openreview.net/forum?id=w5bNwUzj33", + "decision": "Reject", + "ratings": [ + 4, + 6, + 4, + 5, + 4 + ], + "rating": "4.47", + "confidences": [ + 4, + 3, + 4, + 3, + 5 + ], + "abstract": "In order to quickly adapt to new data, few-shot learning aims at learning from few examples, often by using already acquired knowledge. The new data often differs from the previously seen data due to a domain shift, that is, a change of the input-target distribution. While several methods perform well on small domain shifts like new target classes with similar inputs, larger domain shifts are still challenging. Large domain shifts may result in abstract concepts that are not shared between the original and the new domain. However, low-level concepts like edges in images might still be shared and useful. For cross-domain few-shot learning, we suggest representation fusion to unify different abstraction levels of a deep neural network into one representation. We propose Cross-domain Hebbian Ensemble Few-shot learning (CHEF), which consists of representation fusion by an ensemble of Hebbian learners acting on different layers of a deep neural network that was trained on the original domain. On the few-shot datasets miniImagenet and tieredImagenet, where the domain shift is small, CHEF is competitive with state-of-the-art methods. On cross-domain few-shot benchmark challenges with larger domain shifts, CHEF obtains state-of-the-art results in all categories. We further apply CHEF on a real-world cross-domain application in drug discovery. We consider a domain shift from bioactive molecules to environmental chemicals and drugs with twelve associated toxicity prediction tasks. 
On these tasks that are highly relevant for computational drug discovery, CHEF significantly outperforms all its competitors. ", + "title": "Cross-Domain Few-Shot Learning by Representation Fusion", + "authors": [ + "Thomas Adler", + "Johannes Brandstetter", + "Michael Widrich", + "Andreas Mayr", + "David Kreil", + "Michael K Kopp", + "G\u00fcnter Klambauer", + "Sepp Hochreiter" + ], + "emails": [ + "~Johannes_Brandstetter1", + "kreil.org", + "~G\u00fcnter_Klambauer1", + "~Andreas_Mayr2", + "~Michael_K_Kopp1", + "~Thomas_Adler1", + "~Michael_Widrich2", + "~Sepp_Hochreiter1" + ], + "rank": 2318 + }, + { + "url": "https://openreview.net/forum?id=VyENEGiEYAQ", + "decision": "Reject", + "ratings": [ + 6, + 2, + 5, + 6 + ], + "rating": "4.47", + "confidences": [ + 4, + 5, + 3, + 3 + ], + "abstract": "Transformer has become ubiquitous in the deep learning field. One of the key ingredients that destined its success is the self-attention mechanism, which allows fully-connected contextual encoding over input tokens. \nHowever, despite its effectiveness in modeling short sequences, self-attention suffers when handling inputs with extreme long-range dependencies, as its complexity grows quadratically with respect to the sequence length.\nTherefore, long sequences are often encoded by Transformer in chunks using a sliding window.\nIn this paper, we propose Cluster-Former, a novel clustering-based sparse Transformer to perform attention across chunked sequences. The proposed framework is pivoted on two unique types of Transformer layer: Sliding-Window Layer and Cluster-Former Layer, which encode local sequence information and global context jointly and iteratively.\nThis new design allows information integration beyond local windows, which is especially beneficial for question answering (QA) tasks that rely on long-range dependencies. 
Experiments show that Cluster-Former achieves state-of-the-art performance on several major QA benchmarks.", + "title": "Cluster-Former: Clustering-based Sparse Transformer for Question Answering", + "authors": [ + "Shuohang Wang", + "Luowei Zhou", + "Zhe Gan", + "Yen-Chun Chen", + "Yuwei Fang", + "Siqi Sun", + "Yu Cheng", + "Jingjing Liu" + ], + "emails": [ + "~Jingjing_Liu2", + "~Yen-Chun_Chen1", + "microsoft.com", + "~Shuohang_Wang1", + "~Yu_Cheng1", + "~Zhe_Gan1", + "~Siqi_Sun2", + "~Luowei_Zhou1" + ], + "rank": 2319 + }, + { + "url": "https://openreview.net/forum?id=fw1-fHJpPK", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5, + 4 + ], + "rating": "4.46", + "confidences": [ + 2, + 3, + 4, + 4 + ], + "abstract": "Knowledge graph (KG) representation learning methods have achieved competitive performance in many KG-oriented tasks, among which the best ones are usually based on graph neural networks (GNNs), a powerful family of networks that learns the representation of an entity by aggregating the features of its neighbors and itself. However, many KG representation learning scenarios only provide the structure information that describes the relationships among entities, causing that entities have no input features. In this case, existing aggregation mechanisms are incapable of inducing embeddings of unseen entities as these entities have no pre-defined features for aggregation. In this paper, we present a decentralized KG representation learning approach, decentRL, which encodes each entity from and only from the embeddings of its neighbors. For optimization, we design an algorithm to distill knowledge from the model itself such that the output embeddings can continuously gain knowledge from the corresponding original embeddings. Extensive experiments show that the proposed approach performed better than many cutting-edge models on the entity alignment task, and achieved competitive performance on the entity prediction task. 
Furthermore, under the inductive setting, it significantly outperformed all baselines on both tasks.", + "title": "Decentralized Knowledge Graph Representation Learning", + "authors": [ + "Lingbing Guo", + "Weiqing Wang", + "Zequn Sun", + "Chenghao Liu", + "Wei Hu" + ], + "emails": [ + "nju.edu.cn", + "monash.edu", + "~Chenghao_Liu1", + "~Lingbing_Guo1", + "~Zequn_Sun1" + ], + "rank": 2320 + }, + { + "url": "https://openreview.net/forum?id=zI38PZQHWKj", + "decision": "Reject", + "ratings": [ + 6, + 4, + 3 + ], + "rating": "4.45", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "Optimal transport is a machine learning problem with applications including distribution comparison, feature selection, and generative adversarial networks. In this paper, we propose feature-robust optimal transport (FROT) for high-dimensional data, which solves high-dimensional OT problems using feature selection to avoid the curse of dimensionality. Specifically, we find a transport plan with discriminative features. To this end, we formulate the FROT problem as a min--max optimization problem. We then propose a convex formulation of the FROT problem and solve it using a Frank--Wolfe-based optimization algorithm, whereby the subproblem can be efficiently solved using the Sinkhorn algorithm. Since FROT finds the transport plan from selected features, it is robust to noise features. To show the effectiveness of FROT, we propose using the FROT algorithm for the layer selection problem in deep neural networks for semantic correspondence. By conducting synthetic and benchmark experiments, we demonstrate that the proposed method can find a strong correspondence by determining important layers. 
We show that the FROT algorithm achieves state-of-the-art performance in real-world semantic correspondence datasets.", + "title": "Feature-Robust Optimal Transport for High-Dimensional Data", + "authors": [ + "Mathis Petrovich", + "Chao Liang", + "Ryoma Sato", + "Yanbin Liu", + "Yao-Hung Hubert Tsai", + "Linchao Zhu", + "Yi Yang", + "Ruslan Salakhutdinov", + "Makoto Yamada" + ], + "emails": [ + "~Yao-Hung_Hubert_Tsai1", + "~Yi_Yang4", + "~Ryoma_Sato1", + "~Linchao_Zhu1", + "zju.edu.cn", + "gmail.com", + "~Makoto_Yamada3", + "~Ruslan_Salakhutdinov1", + "~Yanbin_Liu1" + ], + "rank": 2321 + }, + { + "url": "https://openreview.net/forum?id=9nIulvlci5", + "decision": "Reject", + "ratings": [ + 3, + 4, + 7 + ], + "rating": "4.45", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "The data representation plays an important role in evaluating similarity between objects. In this paper, we propose a novel approach for implicit data representation to evaluate similarity of input data using a trained neural network. In contrast to the previous approach, which uses gradients for representation, we utilize only the outputs from the last hidden layer of a neural network and do not use a backward step. The proposed technique explicitly takes into account the initial task and significantly reduces the size of the vector representation, as well as the computation time. Generally, a neural network obtains representations related only to the problem being solved, which makes the last hidden layer representation useless for input similarity task.\nIn this paper, we consider two reasons for the decline in the quality of representations: correlation between neurons and insufficient size of the last hidden layer. To reduce the correlation between neurons we use orthogonal weight initialization for each layer and modify the loss function to ensure orthogonality of the weights during training. Moreover, we show that activation functions can potentially increase correlation. 
To solve this problem, we apply modified Batch-Normalization with Dropout. Using orthogonal weight matrices allow us to consider such neural networks as an application of the Random Projection method and get a lower bound estimate for the size of the last hidden layer. We perform experiments on MNIST and physical examination datasets. In both experiments, initially, we split a set of labels into two disjoint subsets to train a neural network for binary classification problem, and then use this model to measure similarity between input data and define hidden classes. We also cluster the inputs to evaluate how well objects from the same hidden class are grouped together. Our experimental results show that the proposed approach achieves competitive results on the input similarity task while reducing both computation time and the size of the input representation.", + "title": "Neural Random Projection: From the Initial Task To the Input Similarity Problem", + "authors": [ + "Alan Savushkin", + "Nikita Benkovich", + "Dmitry Golubev" + ], + "emails": [ + "kaspersky.com", + "~Alan_Savushkin1" + ], + "rank": 2322 + }, + { + "url": "https://openreview.net/forum?id=--GJkm7nt0", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5, + 4 + ], + "rating": "4.44", + "confidences": [ + 5, + 4, + 3, + 4 + ], + "abstract": "We propose ENVISE, an online distillation framework that ENhances VISual representations for Efficient object recognition. We are motivated by the observation that in many real-world scenarios, the probability of occurrence of all classes is not the same and only a subset of classes occur frequently. Exploiting this fact, we aim to reduce the computations of our framework by employing a binary student network (BSN) to learn the frequently occurring classes using the pseudo-labels generated by the teacher network (TN) on an unlabeled image stream. 
To maintain overall accuracy, the BSN must also accurately determine when a rare (or unknown) class is present in the image stream so that the TN can be used in such cases. To achieve this, we propose an attention triplet loss which ensures that the BSN emphasizes the same semantically meaningful regions of the image as the TN. When the prior class probabilities in the image stream vary, we demonstrate that the BSN adapts to the TN faster than the real-valued student network. We also introduce Gain in Efficiency (GiE), a new metric which estimates the relative reduction in FLOPS based on the number of times the BSN and TN are used to process the image stream. We benchmark CIFAR-100 and tiny-imagenet datasets by creating meaningful inlier (frequent) and outlier (rare) class pairs that mimic real-world scenarios. We show that ENVISE outperforms state-of-the-art (SOTA) outlier detection methods in terms of GiE, and also achieves greater separation between inlier and outlier classes in the feature space.", + "title": "Enhancing Visual Representations for Efficient Object Recognition during Online Distillation", + "authors": [ + "Shashanka Venkataramanan", + "Bruce W McIntosh", + "Abhijit Mahalanobis" + ], + "emails": [ + "~Abhijit_Mahalanobis1", + "~Bruce_W_McIntosh1", + "~Shashanka_Venkataramanan1" + ], + "rank": 2323 + }, + { + "url": "https://openreview.net/forum?id=_1ZLtKirOg3", + "ratings": [ + 5, + 5, + 3, + 5 + ], + "rating": "4.44", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "There is an increasing interest in the use of automatic mathematical word problem (MWP) generation in educational assessment. Different from standard natural question generation, MWP generation needs to maintain the underlying mathematical operations between quantities and variables, while at the same time ensuring the relevance between the output and the given topic. 
To address above problem we develop an end-to-end neural model to generate personalized and diverse MWPs in real-world scenarios from commonsense knowledge graph and equations. The proposed model (1) learns both representations from edge-enhanced Levi graphs of symbolic equations and commonsense knowledge; (2) automatically fuses equation and commonsense knowledge information via a self-planning module when generating the MWPs. Experiments on an educational gold-standard set and a large-scale generated MWP set show that our approach is superior on the MWP generation task, and it outperforms the state-of-the-art models in terms of both automatic evaluation metrics, i.e., BLEU-4, ROUGE-L, Self-BLEU, and human evaluation metrics, i.e, equation relevance, topic relevance, and language coherence.", + "title": "Mathematical Word Problem Generation from Commonsense Knowledge Graph and Equations", + "authors": [ + "Tianqiao Liu", + "Qiang Fang", + "Wenbiao Ding", + "Zhongqin Wu", + "Zitao Liu" + ], + "emails": [ + "tal.com", + "~Zitao_Liu1", + "~Tianqiao_Liu1" + ], + "rank": 2324 + }, + { + "url": "https://openreview.net/forum?id=LT0KSFnQDWF", + "decision": "Reject", + "ratings": [ + 5, + 6, + 3, + 4 + ], + "rating": "4.44", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "While Graph Neural Networks (GNNs) have achieved remarkable results in a variety of applications, recent studies exposed important shortcomings in their ability to capture the structure of the underlying graph. It has been shown that the expressive power of standard GNNs is bounded by the Weisfeiler-Lehman (WL) graph isomorphism test, from which they inherit proven limitations such as the inability to detect and count graph substructures. On the other hand, there is significant empirical evidence, e.g. 
in network science and bioinformatics, that substructures are often informative for downstream tasks, suggesting that it is desirable to design GNNs capable of leveraging this important source of information. To this end, we propose a novel topologically-aware message passing scheme based on substructure encoding. We show that our architecture allows incorporating domain-specific inductive biases and that it is strictly more expressive than the WL test. Importantly, in contrast to recent works on the expressivity of GNNs, we do not attempt to adhere to the WL hierarchy; this allows us to retain multiple attractive properties of standard GNNs such as locality and linear network complexity, while being able to disambiguate even hard instances of graph isomorphism. We extensively evaluate our method on graph classification and regression tasks and show state-of-the-art results on multiple datasets including molecular graphs and social networks.", + "title": "Improving Graph Neural Network Expressivity via Subgraph Isomorphism Counting", + "authors": [ + "Giorgos Bouritsas", + "Fabrizio Frasca", + "Stefanos Zafeiriou", + "Michael M. Bronstein" + ], + "emails": [ + "~Stefanos_Zafeiriou1", + "~Fabrizio_Frasca1", + "~Michael_M._Bronstein1", + "~Giorgos_Bouritsas1" + ], + "rank": 2325 + }, + { + "url": "https://openreview.net/forum?id=EAZHurUYz8U", + "ratings": [ + 6, + 5, + 3 + ], + "rating": "4.44", + "confidences": [ + 3, + 2, + 4 + ], + "abstract": "The inductive bias of a neural network is largely determined by the architecture and the training algorithm. To achieve good generalization, how to effectively train a neural network is of great importance. We propose a novel orthogonal over-parameterized training (OPT) framework that can provably minimize the hyperspherical energy which characterizes the diversity of neurons on a hypersphere. By maintaining the minimum hyperspherical energy during training, OPT can greatly improve the empirical generalization. 
Specifically, OPT fixes the randomly initialized weights of the neurons and learns an orthogonal transformation that applies to these neurons. We consider multiple ways to learn such an orthogonal transformation, including unrolling orthogonalization algorithms, applying orthogonal parameterization, and designing orthogonality-preserving gradient descent. For better scalability, we further propose the stochastic OPT which performs orthogonal transformation stochastically for partial dimensions of the neuron. Interestingly, OPT reveals that learning a proper coordinate system for neurons is crucial to generalization. We provide some insights on why OPT yields better generalization. Extensive experiments validate the superiority of OPT.", + "title": "Orthogonal Over-Parameterized Training", + "authors": [ + "Weiyang Liu", + "Rongmei Lin", + "Zhen Liu", + "James Matthew Rehg", + "Li Xiong", + "Adrian Weller", + "Le Song" + ], + "emails": [ + "~Li_Xiong1", + "~Le_Song1", + "~Adrian_Weller1", + "~James_Matthew_Rehg1", + "~Weiyang_Liu1", + "~Rongmei_Lin1", + "~Zhen_Liu6" + ], + "rank": 2326 + }, + { + "url": "https://openreview.net/forum?id=VcCZDjZiqFf", + "ratings": [ + 5, + 3, + 4, + 6 + ], + "rating": "4.44", + "confidences": [ + 5, + 5, + 4, + 4 + ], + "abstract": "Deep neural networks (DNNs) have achieved amazing success on a wide range of high-level computer vision tasks. However, it is proved that DNNs are vulnerable to adversarial samples. The threat of adversarial samples comes from the large distribution gap between adversarial samples and clean samples in the feature spaces of the target DNNs. To this, we utilize deep generative networks with a novel training scheme to eliminate the distribution gap. Our training strategy introduces constraints in both pixel level as well as feature level, and the trained network can effectively align the distribution of adversarial samples with clean samples for target DNNs through translating their pixel values. 
Specifically, compared with previous methods, we propose a more efficient pixel-level training constraint to weaken the hardness of aligning adversarial samples to clean samples, which can thus obviously enhance the robustness on adversarial samples. Besides, a class-aware feature-level constraint is formulated for integrated distribution alignment. Our approach is general and suitable for multiple tasks like image classification, semantic segmentation and object detection. We conduct extensive experiments on these three tasks and different datasets, on which the superiority of our strategy over existing methods demonstrates its effectiveness and generality.", + "title": "General Adversarial Defense via Pixel Level and Feature Level Distribution Alignment", + "authors": [ + "Xiaogang Xu", + "Hengshuang Zhao", + "Philip Torr", + "Jiaya Jia" + ], + "emails": [ + "~Jiaya_Jia1", + "~Philip_Torr1", + "~Xiaogang_Xu2", + "~Hengshuang_Zhao2" + ], + "rank": 2327 + }, + { + "url": "https://openreview.net/forum?id=gnAGreyEzP2", + "ratings": [ + 5, + 6, + 3, + 4 + ], + "rating": "4.44", + "confidences": [ + 4, + 4, + 5, + 3 + ], + "abstract": "This paper explores an intriguing idea of recursively parameterizing recurrent nets. Simply speaking, this refers to recurrently controlling a recurrent network with recurrent networks controlled by recurrent networks. The proposed architecture recursively parameterizes its gating functions whereby gating mechanisms of X-RNN are controlled by instances of itself, which are repeatedly called in a recursive fashion. We postulate that our proposed inductive bias provides modeling benefits pertaining to learning with inherently hierarchically-structured sequence data. 
To this end, we conduct extensive experiments on recursive logic tasks (sorting, tree traversal, logical inference), sequential pixel-by-pixel classification, semantic parsing, code generation, machine translation and polyphonic music modeling, demonstrating the widespread utility of the proposed approach, i.e., achieving optimistic and competitive results on all tasks.", + "title": "Recurrently Controlling a Recurrent Network with Recurrent Networks Controlled by More Recurrent Networks", + "authors": [ + "Yi Tay", + "Yikang Shen", + "Alvin Chan", + "Aston Zhang", + "Shuai Zhang" + ], + "emails": [ + "~Aston_Zhang2", + "~Alvin_Chan1", + "~Yi_Tay1", + "~Yikang_Shen1", + "~Shuai_Zhang7" + ], + "rank": 2328 + }, + { + "url": "https://openreview.net/forum?id=hKps4HGGGx", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 5 + ], + "rating": "4.44", + "confidences": [ + 4, + 5, + 4, + 3 + ], + "abstract": "Adversarial examples easily mislead the vision systems based on deep neural networks (DNNs) trained with the softmax cross entropy (SCE) loss. Such a vulnerability of DNN comes from the fact that SCE drives DNNs to fit on the training samples, whereas the resultant feature distributions between the training and adversarial examples are unfortunately misaligned. Several state-of-the-arts start from improving the inter-class separability of training samples by modifying loss functions, where we argue that the adversarial examples are ignored and thus limited robustness to adversarial attacks is resulted. In this paper, we exploit inference region which inspires us to involve a margin-like inference information to SCE, resulting in a novel inference-softmax cross entropy (I-SCE) loss, which is intuitively appealing and interpretable. The inference information is a guarantee to both the inter-class separability and the improved generalization to adversarial examples, which is furthermore demonstrated under the min-max framework. 
Extensive experiments show that under strong adaptive attacks, the DNN models trained with the proposed I-SCE loss achieve superior performance and robustness over the state-of-the-arts.", + "title": "Improving robustness of softmax corss-entropy loss via inference information", + "authors": [ + "Bingbing Song", + "Wei He", + "Renyang Liu", + "Shui Yu", + "Ruxin Wang", + "Mingming Gong", + "Tongliang Liu", + "Wei Zhou" + ], + "emails": [ + "~Mingming_Gong1", + "ynu.edu.cn", + "~Tongliang_Liu1", + "~Bingbing_Song1", + "~Ruxin_Wang1", + "mail.ynu.edu.cn", + "~Shui_Yu1" + ], + "rank": 2329 + }, + { + "url": "https://openreview.net/forum?id=xQLKXw1nhiD", + "ratings": [ + 3, + 4, + 5, + 6 + ], + "rating": "4.44", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "Non-visual imaging sensors are widely used in the industry for different purposes. Those sensors are more expensive than visual (RGB) sensors, and usually produce images with lower resolution. To this end, Cross-Modality Super-Resolution methods were introduced, where an RGB image of a high-resolution assists in increasing the resolution of a low-resolution modality. However, fusing images from different modalities is not a trivial task, since each multi-modal pair varies greatly in its internal correlations. For this reason, traditional state-of-the-arts which are trained on external datasets often struggle with yielding an artifact-free result that is still loyal to the target modality characteristics.\n\nWe present CMSR, a single-pair approach for Cross-Modality Super-Resolution. The network is internally trained on the two input images only, in a self-supervised manner, learns their internal statistics and correlations, and applies them to upsample the target modality. CMSR contains an internal transformer which is trained on-the-\ufb02y together with the up-sampling process itself and without supervision, to allow dealing with pairs that are only weakly aligned. 
We show that CMSR produces state-of-the-art super resolved images, yet without introducing artifacts or irrelevant details that originate from the RGB image only.", + "title": "Single Pair Cross-Modality Super Resolution", + "authors": [ + "Guy Shacht", + "Sharon Fogel", + "Dov Danon", + "Ilya Leizerson", + "Daniel Cohen-or" + ], + "emails": [ + "~Dov_Danon1", + "gmail.com", + "~Guy_Shacht1", + "~Daniel_Cohen-or2", + "elbitsystems.com" + ], + "rank": 2330 + }, + { + "url": "https://openreview.net/forum?id=G1KjzLWU4ci", + "ratings": [ + 4, + 7, + 5, + 3 + ], + "rating": "4.43", + "confidences": [ + 4, + 2, + 4, + 4 + ], + "abstract": "When a person identifies objects, he or she can think by associating objects to many classes and conclude by taking inter-class relations into account. This cognitive system can make a more reliable prediction. Inspired by these observations, we propose a new network training strategy to consider inter-class relations, namely LogitMix. Specifically, we use recent data augmentation techniques (e.g., Mixup, Manifold Mixup, or Cutmix) as baselines for generating mixed samples. Then, LogitMix suggests using the mixed logit (ie., the mixture of two logits) as an auxiliary training objective. Because using logit before softmax activation preserves rich class relationships, it can serve as a weak-supervision signal concerning inter-class relations. Our experimental results demonstrate that LogitMix achieves state-of-the-art performance among recent data augmentation techniques in terms of both calibration error and prediction accuracy. 
The source code is attached as the supplementary material.", + "title": "Logit As Auxiliary Weak-supervision for More Reliable and Accurate Prediction", + "authors": [ + "Duhyeon Bang", + "Yunho Jeon", + "Jin-Hwa Kim", + "Jiwon Kim", + "Hyunjung Shim" + ], + "emails": [ + "~Jin-Hwa_Kim1", + "~Hyunjung_Shim1", + "~Yunho_Jeon1", + "~Duhyeon_Bang1", + "~Jiwon_Kim3" + ], + "rank": 2331 + }, + { + "url": "https://openreview.net/forum?id=IpsTSvfIB6", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4 + ], + "rating": "4.43", + "confidences": [ + 3, + 3, + 1 + ], + "abstract": "The Birkhoff-von-Neumann (BvN) decomposition is a standard tool used to draw permutation matrices from a doubly stochastic (DS) matrix. The BvN decomposition represents such a DS matrix as a convex combination of several permutation matrices. Currently, most algorithms to compute the BvN decomposition employ either greedy strategies or custom-made heuristics. In this paper, we present a novel differentiable approach to approximate the BvN decomposition. Our algorithm builds upon recent advances in Riemannian optimization on Birkhoff polytopes. We offer an empirical evaluation of this approach in the fairness of exposure in rankings, where we show that the outcome of our method behaves similarly to greedy algorithms. Our approach is an excellent addition to existing methods for sampling from DS matrices, such as sampling from a Gumbel-Sinkhorn distribution. However, our approach is better suited for applications where the latency in prediction time is a constraint. Indeed, we can generally precompute an approximated BvN decomposition offline. Then, we select a permutation matrix at random with probability proportional to its coefficient. 
Finally, we provide an implementation of our method.", + "title": "Approximate Birkhoff-von-Neumann decomposition: a differentiable approach", + "authors": [ + "Andr\u00e9s Hoyos-Idrobo" + ], + "emails": [ + "~Andr\u00e9s_Hoyos-Idrobo1" + ], + "rank": 2332 + }, + { + "url": "https://openreview.net/forum?id=yeeS_HULL7Z", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 5 + ], + "rating": "4.43", + "confidences": [ + 3, + 3, + 5, + 3 + ], + "abstract": "In machine learning, no data point stands alone. We believe that context is an underappreciated concept in many machine learning methods. We propose Attention-Based Clustering (ABC), a neural architecture based on the attention mechanism, which is designed to learn latent representations that adapt to context within an input set, and which is inherently agnostic to input sizes and number of clusters. By learning a similarity kernel, our method directly combines with any out-of-the-box kernel-based clustering approach. We present competitive results for clustering Omniglot characters and include analytical evidence of the effectiveness of an attention-based approach for clustering. ", + "title": "Attention-Based Clustering: Learning a Kernel from Context", + "authors": [ + "Samuel Coward", + "Erik Visse-Martindale", + "Chithrupa Ramesh" + ], + "emails": [ + "~Samuel_Coward1", + "uk.zuken.com", + "~Chithrupa_Ramesh1" + ], + "rank": 2333 + }, + { + "url": "https://openreview.net/forum?id=6BWY3yDdDi", + "decision": "Reject", + "ratings": [ + 4, + 3, + 7, + 5 + ], + "rating": "4.43", + "confidences": [ + 4, + 4, + 2, + 4 + ], + "abstract": "Softmax classifiers with a very large number of classes naturally occur in many applications such as natural language processing and information retrieval. The calculation of full-softmax is very expensive from the computational and energy perspective. There have been a variety of sampling approaches to overcome this challenge, popularly known as negative sampling (NS). 
Ideally, NS should sample negative classes from a distribution that is dependent on the input data, the current parameters, and the correct positive class. Unfortunately, due to the dynamically updated parameters and data samples, there does not exist any sampling scheme that is truly adaptive and also samples the negative classes in constant time every iteration. Therefore, alternative heuristics like random sampling, static frequency-based sampling, or learning-based biased sampling; which primarily trade either the sampling cost or the adaptivity of samples per iteration, are adopted. In this paper, we show a class of distribution where the sampling scheme is truly adaptive and provably generates negative samples in constant time. We demonstrate a negative sampling implementation that is significantly faster, in terms of wall clock time, compared to the most optimized TensorFlow implementations of standard softmax or other sampling approaches on the best available GPUs (V100s).", + "title": "A Truly Constant-time Distribution-aware Negative Sampling", + "authors": [ + "Shabnam Daghaghi", + "Tharun Medini", + "Beidi Chen", + "Mengnan Zhao", + "Anshumali Shrivastava" + ], + "emails": [ + "~Tharun_Medini1", + "~Beidi_Chen1", + "rice.edu", + "~Shabnam_Daghaghi1", + "~Anshumali_Shrivastava1" + ], + "rank": 2334 + }, + { + "url": "https://openreview.net/forum?id=rd_bm8CK7o0", + "decision": "Reject", + "ratings": [ + 4, + 3, + 6, + 5 + ], + "rating": "4.43", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Sample efficiency and performance in the offline setting have emerged as among the main\nchallenges of deep reinforcement learning. 
We introduce Q-Value Weighted Regression (QWR),\na simple RL algorithm that excels in these aspects.\n\nQWR is an extension of Advantage Weighted Regression (AWR), an off-policy actor-critic algorithm\nthat performs very well on continuous control tasks, also in the offline setting, but struggles\non tasks with discrete actions and in sample efficiency. We perform a theoretical analysis\nof AWR that explains its shortcomings and use the insights to motivate QWR theoretically.\n\nWe show experimentally that QWR matches state-of-the-art algorithms both on tasks with\ncontinuous and discrete actions. We study the main hyperparameters of QWR\nand find that it is stable in a wide range of their choices and on different tasks.\nIn particular, QWR yields results on par with SAC on the MuJoCo suite and - with\nthe same set of hyperparameters -- yields results on par with a highly tuned Rainbow\nimplementation on a set of Atari games. We also verify that QWR performs well in the\noffline RL setting, making it a compelling choice for reinforcement learning in domains\nwith limited data.", + "title": "Q-Value Weighted Regression: Reinforcement Learning with Limited Data", + "authors": [ + "Piotr Kozakowski", + "Lukasz Kaiser", + "Henryk Michalewski", + "Afroz Mohiuddin", + "Katarzyna Ka\u0144ska" + ], + "emails": [ + "~Lukasz_Kaiser1", + "~Henryk_Michalewski1", + "google.com", + "~Piotr_Kozakowski3", + "~Afroz_Mohiuddin1" + ], + "rank": 2335 + }, + { + "url": "https://openreview.net/forum?id=CzRSsOG6JDw", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5, + 5 + ], + "rating": "4.43", + "confidences": [ + 5, + 3, + 3, + 3 + ], + "abstract": "Algorithms inferring rewards from human behavior typically assume that people are (approximately) rational. In reality, people exhibit a wide array of irrationalities. Motivated by understanding the benefits of modeling these irrationalities, we analyze the effects that demonstrator irrationality has on reward inference. 
We propose operationalizing several forms of irrationality in the language of MDPs, by altering the Bellman optimality equation, and use this framework to study how these alterations affect inference. \n\nWe find that incorrectly assuming noisy-rationality for an irrational demonstrator can lead to remarkably poor reward inference accuracy, even in situations where inference with the correct model leads to good inference. This suggests a need to either model irrationalities or find reward inference algorithms that are more robust to misspecification of the demonstrator model. Surprisingly, we find that if we give the learner access to the correct model of the demonstrator's irrationality, these irrationalities can actually help reward inference. In other words, if we could choose between a world where humans were perfectly rational and the current world where humans have systematic biases, the current world might counter-intuitively be preferable for reward inference. We reproduce this effect in several domains. 
While this finding is mainly conceptual, it is perhaps actionable as well: we might ask human demonstrators for myopic demonstrations instead of optimal ones, as they are more informative for the learner and might be easier for a human to generate.", + "title": "The impacts of known and unknown demonstrator irrationality on reward inference", + "authors": [ + "Lawrence Chan", + "Andrew Critch", + "Anca Dragan" + ], + "emails": [ + "~Andrew_Critch1", + "~Anca_Dragan1", + "~Lawrence_Chan2" + ], + "rank": 2336 + }, + { + "url": "https://openreview.net/forum?id=MDX3F0qAfm3", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 6 + ], + "rating": "4.43", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "The recent theoretical investigation (Li et al., 2020) on the upper bound of generalization error of deep neural networks (DNNs) demonstrates the potential of using the gradient norm as a measure that complements validation accuracy for model selection in practice. In this work, we carry out empirical studies using several commonly-used neural network architectures and benchmark datasets to understand the effectiveness and efficiency of using gradient norm as the model selection criterion, especially in the settings of hyper-parameter optimization. While strong correlations between the generalization error and the gradient norm measures have been observed, we find the computation of gradient norm is time consuming due to the high gradient complexity. To balance the trade-off between efficiency and effectiveness, we propose to use an accelerated approximation (Goodfellow, 2015) of gradient norm that only computes the loss gradient in the Fully-Connected Layer (FC Layer) of DNNs with significantly reduced computation cost (200~20,000 times faster). 
Our empirical studies clearly find that the use of approximated gradient norm, as one of the hyper-parameter search objectives, can select the models with lower generalization error, but the efficiency is still low (marginal accuracy improvement but with high computation overhead). Our results also show that the bandit-based or population-based algorithms, such as BOHB, perform poorer with gradient norm objectives, since the correlation between gradient norm and generalization error is not always consistent across phases of the training process. Finally, gradient norm also fails to predict the generalization performance of models based on different architectures, in comparison with state of the art algorithms and metrics.", + "title": "Can We Use Gradient Norm as a Measure of Generalization Error for Model Selection in Practice?", + "authors": [ + "Haozhe An", + "Haoyi Xiong", + "Xuhong Li", + "Xingjian Li", + "Dejing Dou", + "Zhanxing Zhu" + ], + "emails": [ + "~Xingjian_Li1", + "~Haoyi_Xiong1", + "~Haozhe_An1", + "~Zhanxing_Zhu1", + "~Dejing_Dou1", + "~Xuhong_Li3" + ], + "rank": 2337 + }, + { + "url": "https://openreview.net/forum?id=PO0SuuafSX", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5, + 5 + ], + "rating": "4.43", + "confidences": [ + 3, + 5, + 2, + 4 + ], + "abstract": "Some forms of novel visual media enable the viewer to explore a 3D scene from essentially arbitrary viewpoints, by interpolating between a discrete set of original views. Compared to 2D imagery, these types of applications require much larger amounts of storage space, which we seek to reduce. Existing approaches for compressing 3D scenes are based on a separation of compression and rendering: each of the original views is compressed using traditional 2D image formats; the receiver decompresses the views and then performs the rendering. 
We unify these steps by directly compressing an implicit representation of the scene, a function that maps spatial coordinates to a radiance vector field, which can then be queried to render arbitrary viewpoints. The function is implemented as a neural network and jointly trained for reconstruction as well as compressibility, in an end-to-end manner, with the use of an entropy penalty on the parameters. Our method significantly outperforms a state-of-the-art conventional approach for scene compression, achieving simultaneously higher quality reconstructions and lower bitrates. Furthermore, we show that the performance at lower bitrates can be improved by jointly representing multiple scenes using a soft form of parameter sharing.", + "title": "3D Scene Compression through Entropy Penalized Neural Representation Functions", + "authors": [ + "Thomas Bird", + "Johannes Ball\u00e9", + "Saurabh Singh", + "Philip Chou" + ], + "emails": [ + "~Thomas_Bird1", + "~Johannes_Ball\u00e91", + "~Philip_Chou1", + "~Saurabh_Singh1" + ], + "rank": 2338 + }, + { + "url": "https://openreview.net/forum?id=R7aFOrR0b2", + "decision": "Reject", + "ratings": [ + 4, + 4, + 6, + 4 + ], + "rating": "4.43", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Neural networks are known to be data-hungry, and collecting large labeled datasets is often a crucial step in deep learning deployment. Researchers have studied dataset aspects such as distributional shift and labeling cost, primarily using downstream prediction accuracy for evaluation. In sensitive real-world applications such as medicine and self-driving cars, not only is the accuracy important, but also the calibration -- the extent that model uncertainty reflects the actual correctness likelihood. It has recently been shown that modern neural networks are ill-calibrated. In this work, we take a complementary approach -- studying how dataset properties, rather than architecture, affects calibration. 
For the common issue of dataset imbalance, we show that calibration varies significantly among classes, even when common strategies to mitigate class imbalance are employed. We also study the effects of label quality, showing how label noise dramatically increases calibration error. Furthermore, poor calibration can come from small dataset sizes, which we motive via results on network expressivity. Our experiments demonstrate that dataset properties can significantly affect calibration and suggest that calibration should be measured during dataset curation.", + "title": "Dataset Curation Beyond Accuracy", + "authors": [ + "Johan Bjorck", + "Carla P Gomes" + ], + "emails": [ + "~Carla_P_Gomes1", + "~Johan_Bjorck2" + ], + "rank": 2339 + }, + { + "url": "https://openreview.net/forum?id=gMRmCMoObrW", + "ratings": [ + 5, + 5, + 4, + 4 + ], + "rating": "4.43", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "Disentanglement has been a central task in representation learning, which involves learning interpretable factors of variation in data. Recent efforts in this direction have been devoted to the identifiability problem of deep latent-variable model with the theory of nonlinear ICA, i.e. the true latent variables can be identified or recovered by the encoder. These identifiability results in nonlinear ICA are essentially based on supervised learning. This work extends these results to the scenario of self-supervised learning. First, we point out that a broad types of augmented data can be generated from a latent model. Based on this, we prove an identifiability theorem similar to the work by~\\citep{khemakhem2019variational}: the latent variables for generating augmented data can be identified with some mild conditions. According to our proposed theory, we perform experiments on synthetic data and EMNIST with GIN~\\citep{sorrenson2020disentanglement}. 
In our experiments, we find that even the data is only augmented along a few latent variables, more latent variables can be identified, and adding a small noise in data space can stabilize this outcome. Based on this, we augment digit images on EMNIST simply with three affine transformations and then add small Gaussian noise. It is shown that much more interpretable factors of variation can be successfully identified.", + "title": "Self-supervised Disentangled Representation Learning", + "authors": [ + "Xiaojiang Yang", + "Yitong Sun", + "Junchi Yan" + ], + "emails": [ + "~Junchi_Yan2", + "~Xiaojiang_Yang1", + "~Yitong_Sun1" + ], + "rank": 2340 + }, + { + "url": "https://openreview.net/forum?id=TmkN9JmDJx1", + "decision": "Reject", + "ratings": [ + 6, + 3, + 5, + 4 + ], + "rating": "4.43", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "What is the computational model behind a transformer? Where recurrent neural networks have direct parallels in finite state machines, allowing clear discussion and thought around architecture variants or trained models, transformers have no such familiar parallel. In this paper we aim to change that, proposing a computational model for the transformer-encoder in the form of a programming language. We map the basic components of a transformer-encoder \u2013 attention and feed-forward computation \u2013 into the simple primitives of select, aggregate, and zipmap, around which we form a programming language: the Restricted Access Sequence Process-ing Language (RASP). We show how RASP can be used to program solutions to tasks that could conceivably be learned by a transformer, augmenting it with tools we discover in our work. In particular, we provide RASP programs for histograms, sorting, and even logical inference similar to that of Clark et al. (2020). We further use our model to relate their difficulty in terms of the number of required layers and attention heads. 
Finally, we see how insights gained from our abstraction might be used to explain phenomena seen in recent works.", + "title": "Thinking Like Transformers", + "authors": [ + "Gail Weiss", + "Yoav Goldberg", + "Eran Yahav" + ], + "emails": [ + "~Eran_Yahav1", + "~Yoav_Goldberg1", + "~Gail_Weiss1" + ], + "rank": 2341 + }, + { + "url": "https://openreview.net/forum?id=GiEyS3CFHV_", + "ratings": [ + 5, + 4, + 5, + 4, + 4 + ], + "rating": "4.43", + "confidences": [ + 3, + 4, + 3, + 2, + 2 + ], + "abstract": "Constructing non-vacuous PAC-Bayes bounds on generalization errors for un- bounded risk functionals, especially in the non-asymptotic regime, is an active area of research. However, current state of the art results are applicable only in some very specialized cases. In this work, we give an integrability condition which exactly characterizes when any risk functional, for a given data set and model space, admits such bounds using the Levy-Khintchine theorem. Further, we de- rive a Bahadur-Rao type exact asymptotic bound, which is much sharper than a traditional Chernoff type inequality, especially in the under-sampled regime. 
These bounds give us the flexibility to construct data or model-dependent consistency promoting updates to a data-free prior, which provably improves the generalization performance.", + "title": "Non-Asymptotic PAC-Bayes Bounds on Generalisation Error", + "authors": [ + "Arijit Das" + ], + "emails": [ + "~Arijit_Das2" + ], + "rank": 2342 + }, + { + "url": "https://openreview.net/forum?id=cQyybLUoXxc", + "ratings": [ + 5, + 4, + 4, + 5 + ], + "rating": "4.42", + "confidences": [ + 2, + 3, + 4, + 3 + ], + "abstract": "we have withdrawn our paper.", + "title": "Withdraw", + "authors": [ + "Matthew Thorpe", + "Bao Wang" + ], + "emails": [ + "manchester.ac.uk", + "~Bao_Wang1" + ], + "rank": 2343 + }, + { + "url": "https://openreview.net/forum?id=WUTkGqErZ9", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5 + ], + "rating": "4.42", + "confidences": [ + 4, + 3, + 5 + ], + "abstract": "When seeing a new object, humans can immediately recognize it across different retinal locations: we say that the internal object representation is invariant to translation. It is commonly believed that Convolutional Neural Networks (CNNs) are architecturally invariant to translation thanks to the convolution and/or pooling operations they are endowed with. In fact, several works have found that these networks systematically fail to recognise new objects on untrained locations. In this work we show how, even though CNNs are not 'architecturally invariant' to translation, they can indeed 'learn' to be invariant to translation. We verified that this can be achieved by pretraining on ImageNet, and we found that it is also possible with much simpler datasets in which the items are fully translated across the input canvas. Significantly, simply training everywhere on the canvas was not enough. 
We investigated how this pretraining affected the internal network representations, finding that the invariance was almost always acquired, even though it was some times disrupted by further training due to catastrophic forgetting/interference. \n These experiments show how pretraining a network on an environment with the right 'latent' characteristics (a more naturalistic environment) can result in the network learning deep perceptual rules which would dramatically improve subsequent generalization.", + "title": "Convolutional Neural Networks are not invariant to translation, but they can learn to be", + "authors": [ + "Valerio Biscione", + "Jeffrey Bowers" + ], + "emails": [ + "~Jeffrey_Bowers1", + "~Valerio_Biscione1" + ], + "rank": 2344 + }, + { + "url": "https://openreview.net/forum?id=I8nahMfPixC", + "ratings": [ + 6, + 3, + 2, + 7 + ], + "rating": "4.42", + "confidences": [ + 4, + 5, + 5, + 5 + ], + "abstract": "Due to vulnerability of machine learning algorithms under adversarial examples, it is challenging to defend against them. Recently, various defenses have been proposed to mitigate negative effects of adversarial examples generated from known attacks. However, these methods have obvious limitations against unknown attacks. Cognitive science investigates that the brain can recognize the same person with any expression by extracting invariant information on the face. Similarly, different adversarial examples share the invariant information retained from original examples. Motivated by this observation, we propose a defense framework ADD-Defense, which extracts the invariant information called \\textit{perturbation-invariant representation} (PIR) to defend against widespread adversarial examples. Specifically, realized by adversarial training with additional ability to utilize perturbation-specific information, the PIR is invariant to known attacks and has no perturbation-specific information. 
Facing the imbalance between widespread unknown attacks and limited known attacks, the PIR is expected to generalize well on unknown attacks via being matched to a Gaussian prior distribution. In this way, the PIR is invariant to both known and unknown attacks. Once the PIR is learned, we can generate an example without malicious perturbations as the output. We evaluate our ADD-Defense using various pixel-constrained and spatially-constrained attacks, especially BPDA and AutoAttack. The empirical results illustrate that our ADD-Defense is robust to widespread adversarial examples.", + "title": "ADD-Defense: Towards Defending Widespread Adversarial Examples via Perturbation-Invariant Representation", + "authors": [ + "Dawei Zhou", + "Tongliang Liu", + "Bo Han", + "Nannan Wang", + "Xinbo Gao" + ], + "emails": [ + "~Nannan_Wang1", + "~Dawei_Zhou3", + "~Xinbo_Gao3", + "~Bo_Han1", + "~Tongliang_Liu1" + ], + "rank": 2345 + }, + { + "url": "https://openreview.net/forum?id=lfJpQn3xPV-", + "decision": "Reject", + "ratings": [ + 3, + 5, + 5, + 5 + ], + "rating": "4.41", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Online learning of graph neural networks (GNNs) faces the challenges of distribution shift and ever growing and changing training data, when temporal graphs evolve over time. This makes it inefficient to train over the complete graph whenever new data arrives. Deleting old data at some point in time may be preferable to maintain a good performance and to account for distribution shift. We systematically analyze these issues by incrementally training and evaluating GNNs in a sliding window over temporal graphs. We experiment with three representative GNN architectures and two scalable GNN techniques, on three new datasets. In our experiments, the GNNs face the challenge that new vertices, edges, and even classes appear and disappear over time. 
Our results show that no more than 50% of the GNN's receptive field is necessary to retain at least 95% accuracy compared to training over a full graph. In most cases, i.e., 14 out 18 experiments, we even observe that a temporal window of size 1 is sufficient to retain at least 90%.", + "title": "Online Learning of Graph Neural Networks: When Can Data Be Permanently Deleted", + "authors": [ + "Lukas Paul Achatius Galke", + "Benedikt Franke", + "Tobias Zielke", + "Ansgar Scherp" + ], + "emails": [ + "~Lukas_Paul_Achatius_Galke1", + "~Ansgar_Scherp1", + "uni-ulm.de" + ], + "rank": 2346 + }, + { + "url": "https://openreview.net/forum?id=AFm2njNEE1", + "ratings": [ + 5, + 5, + 4, + 4 + ], + "rating": "4.41", + "confidences": [ + 4, + 3, + 5, + 5 + ], + "abstract": "Differentiable neural architecture search (NAS) has gained much success in discovering more \ufb02exible and diverse cell types. Current methods couple the operations and topology during search, and simply derive optimal topology by a hand-craft rule. However, topology also matters for neural architectures since it controls the interactions between features of operations. In this paper, we highlight the topology learning in differentiable NAS, and propose an explicit topology modeling method, named TopoNAS, to directly decouple the operation selection and topology during search. Concretely, we introduce a set of topological variables and a combinatorial probabilistic distribution to explicitly indicate the target topology. Besides, we also leverage a passive-aggressive regularization to suppress invalid topology within supernet. Our introduced topological variables can be jointly learned with operation variables and supernet weights, and apply to various DARTS variants. Extensive experiments on CIFAR-10 and ImageNet validate the effectiveness of our proposed TopoNAS. The results show that TopoNAS does enable to search cells with more diverse and complex topology, and boost the performance signi\ufb01cantly. 
For example, TopoNAS can improve DARTS by 0.16% accuracy on CIFAR-10 dataset with 40% parameters reduced or 0.35% with similar parameters.", + "title": "Explicit Learning Topology for Differentiable Neural Architecture Search", + "authors": [ + "Tao Huang", + "Shan You", + "Yibo Yang", + "Zhuozhuo Tu", + "Fei Wang", + "Chen Qian", + "Changshui Zhang" + ], + "emails": [ + "~Chen_Qian1", + "~Tao_Huang5", + "~Changshui_Zhang1", + "~Zhuozhuo_Tu1", + "~Yibo_Yang2", + "~Shan_You3", + "~Fei_Wang9" + ], + "rank": 2347 + }, + { + "url": "https://openreview.net/forum?id=S9MPX7ejmv", + "decision": "Reject", + "ratings": [ + 3, + 5, + 5, + 5 + ], + "rating": "4.41", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Many real-word decision or control problems involve multiple conflicting objectives and uncertainties, which requires learned policies are not only Pareto optimal but also robust. In this paper, we proposed a novel algorithm to approximate a representation for robust Pareto frontier through Bayesian-optimization-directed robust multi-objective reinforcement learning (BRMORL). Firstly, environmental uncertainty is modeled as an adversarial agent over the entire space of preferences by incorporating zero-sum game into multi-objective reinforcement learning (MORL). Secondly, a comprehensive metric based on hypervolume and information entropy is presented to evaluate convergence, diversity and evenness of the distribution for Pareto solutions. Thirdly, the agent\u2019s learning process is regarded as a black-box, and the comprehensive metric we proposed is computed after each episode of training, then a Bayesian optimization (BO) algorithm is adopted to guide the agent to evolve towards improving the quality of the approximated Pareto frontier. 
Finally, we demonstrate the effectiveness of proposed approach on challenging multi-objective tasks across four environments, and show our scheme can produce robust policies under environmental uncertainty.", + "title": "Approximating Pareto Frontier through Bayesian-optimization-directed Robust Multi-objective Reinforcement Learning", + "authors": [ + "Xiangkun He", + "Jianye HAO", + "Dong Li", + "Bin Wang", + "Wulong Liu" + ], + "emails": [ + "~Bin_Wang12", + "~Wulong_Liu1", + "~Jianye_HAO1", + "~Xiangkun_He1", + "~Dong_Li10" + ], + "rank": 2348 + }, + { + "url": "https://openreview.net/forum?id=IjIzIOkK2D6", + "decision": "Reject", + "ratings": [ + 5, + 5, + 3, + 5 + ], + "rating": "4.41", + "confidences": [ + 4, + 5, + 5, + 3 + ], + "abstract": "Recently, graph neural networks (GNN) have been demonstrated effective in various graph-based tasks. \nTo obtain state-of-the-art (SOTA) data-specific GNN architectures, researchers turn to the neural architecture search (NAS) methods. \nHowever, it remains to be a challenging problem to conduct efficient architecture search for GNN.\nIn this work, we present a novel framework for Efficient GrAph Neural architecture search (EGAN).\nBy designing a novel and expressive search space, an efficient one-shot NAS method based on stochastic relaxation and natural gradient is proposed.\nFurther, to enable architecture search in large graphs, a transfer learning paradigm is designed.\nExtensive experiments, including node-level and graph-level tasks, are conducted. 
The results show that the proposed EGAN can obtain SOTA data-specific architectures, and reduce the search cost by two orders of magnitude compared to existing NAS baselines.", + "title": "Efficient Graph Neural Architecture Search", + "authors": [ + "Huan Zhao", + "Lanning Wei", + "quanming yao", + "Zhiqiang He" + ], + "emails": [ + "~quanming_yao1", + "levono.com", + "~Huan_Zhao2", + "~Lanning_Wei1" + ], + "rank": 2349 + }, + { + "url": "https://openreview.net/forum?id=53WS781RzT9", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4, + 3 + ], + "rating": "4.41", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "We study mini-batch stochastic gradient descent (SGD) dynamics under linear regression and deep linear networks by focusing on the variance of the gradients only given the initial weights and mini-batch size, which is the first study of this nature. In the linear regression case, we show that in each iteration the norm of the gradient is a decreasing function of the mini-batch size b and thus the variance of the stochastic gradient estimator is a decreasing function of b. For deep neural networks with L2 loss we show that the variance of the gradient is a polynomial in 1/b. The results theoretically back the important intuition that smaller batch sizes yield larger variance of the stochastic gradients and lower loss function values which is a common believe among the researchers. The proof techniques exhibit a relationship between stochastic gradient estimators and initial weights, which is useful for further research on the dynamics of SGD. We empirically provide insights to our results on various datasets and commonly used deep network structures. 
We further discuss possible extensions of the approaches we build in studying the generalization ability of the deep learning models.", + "title": "The Impact of the Mini-batch Size on the Dynamics of SGD: Variance and Beyond", + "authors": [ + "Xin Qian", + "Diego Klabjan" + ], + "emails": [ + "~Xin_Qian2", + "~Diego_Klabjan1" + ], + "rank": 2350 + }, + { + "url": "https://openreview.net/forum?id=YD792AFzt4o", + "decision": "Reject", + "ratings": [ + 5, + 3, + 5, + 4, + 5 + ], + "rating": "4.41", + "confidences": [ + 4, + 3, + 3, + 4, + 3 + ], + "abstract": "From the statistical learning perspective, complexity control via explicit regularization is a necessity for improving the generalization of over-parameterized models, which deters the memorization of intricate patterns existing only in the training data. However, the impressive generalization performance of over-parameterized neural networks with only implicit regularization challenges the importance of explicit regularization. Furthermore, explicit regularization does not prevent neural networks from memorizing unnatural patterns, such as random labels. In this work, we revisit the role and importance of explicit regularization methods for generalization of the predictive probability, not just the generalization of the 0-1 loss. Specifically, we analyze the possible cause of the poor predictive probability and identify that regularization of predictive confidence is required during training. We then empirically show that explicit regularization significantly improves the reliability of the predictive probability, which enables better predictive uncertainty representation and prevents the overconfidence problem. 
Our findings present a new direction to improve the predictive probability quality of deterministic neural networks, which can be an efficient and scalable alternative to Bayesian neural networks and ensemble methods.\n", + "title": "Revisiting Explicit Regularization in Neural Networks for Reliable Predictive Probability", + "authors": [ + "Taejong Joo", + "Uijung Chung" + ], + "emails": [ + "~Taejong_Joo1", + "~Uijung_Chung1" + ], + "rank": 2351 + }, + { + "url": "https://openreview.net/forum?id=dvXFpV6boX", + "ratings": [ + 3, + 5, + 4, + 8, + 2 + ], + "rating": "4.41", + "confidences": [ + 4, + 5, + 5, + 4, + 4 + ], + "abstract": "The state of the art in open-domain question answering (QA) relies on an efficient retriever that drastically reduces the search space for the expensive reader. A rather overlooked question in the community is the relationship between the retriever and the reader, and in particular, if the whole purpose of the retriever is just a fast approximation for the reader. Our empirical evidence indicates that the answer is no, and that the reader and the retriever are complementary to each other even in terms of accuracy only. We make a careful conjecture that the architectural constraint of the retriever, which has been originally intended for enabling approximate search, seems to also make the model more robust in large-scale search. We then propose to distill the reader into the retriever so that the retriever absorbs the strength of the reader while keeping its own benefit. 
Experimental results show that our method can enhance the document recall rate as well as the end-to-end QA accuracy of off-the-shelf retrievers in open-domain QA tasks.", + "title": "Is Retriever Merely an Approximator of Reader?", + "authors": [ + "Sohee Yang", + "Minjoon Seo" + ], + "emails": [ + "~Sohee_Yang1", + "~Minjoon_Seo1" + ], + "rank": 2352 + }, + { + "url": "https://openreview.net/forum?id=8CjVaaSSVxg", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4 + ], + "rating": "4.40", + "confidences": [ + 4, + 4, + 2 + ], + "abstract": "Dealing with multi-agent control in networked systems is one of the biggest challenges in Reinforcement Learning (RL) and limited success has been presented compared to recent deep reinforcement learning in single-agent domain. However, obstacles remain in addressing the delayed global information where each agent learns a decentralized control policy based on local observations and messages from connected neighbors. This paper first considers delayed global information sharing by combining the delayed global information and latent imagination of farsighted states in differentiable communication. Our model allows an agent to imagine its future states and communicate that with its neighbors. The predictive message sent to the connected neighbors reduces the delay in global information. 
On the tasks of networked multi-agent traffic control, experimental results show that our model helps stabilize the training of each local agent and outperforms existing algorithms for networked system control.", + "title": "Learning Predictive Communication by Imagination in Networked System Control", + "authors": [ + "Yali Du", + "Yifan Zhao", + "Meng Fang", + "Jun Wang", + "Gangyan Xu", + "Haifeng Zhang" + ], + "emails": [ + "stu.hit.edu.cn", + "~Meng_Fang1", + "ia.ac.cn", + "hit.edu.cn", + "~Jun_Wang2", + "~Yali_Du1" + ], + "rank": 2353 + }, + { + "url": "https://openreview.net/forum?id=6jlNy83JUQ_", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 6 + ], + "rating": "4.40", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Theoretical results show that Bayesian methods can achieve lower bounds on regret for online logistic regression. In practice, however, such techniques may not be feasible especially for very large feature sets. Various approximations that, for huge sparse feature sets, diminish the theoretical advantages, must be used. Often, they apply stochastic gradient methods with hyper-parameters that must be tuned on some surrogate loss, defeating theoretical advantages of Bayesian methods. The surrogate loss, defined to approximate the mixture, requires techniques as Monte Carlo sampling, increasing computations per example. We propose low complexity analytical approximations for sparse online logistic and probit regressions. Unlike variational inference and other methods, our methods use analytical closed forms, substantially lowering computations. Unlike dense solutions, \nas Gaussian Mixtures, our methods allow for sparse problems with huge feature sets without increasing complexity. With the analytical closed forms, there is also no need for applying stochastic gradient methods on surrogate losses, and for tuning and balancing learning and regularization hyper-parameters. 
Empirical results top the performance of the more computationally involved methods. Like such methods, our methods still reveal per feature and per example uncertainty measures.\n", + "title": "Low Complexity Approximate Bayesian Logistic Regression for Sparse Online Learning", + "authors": [ + "Gil I. Shamir", + "Wojciech Szpankowski" + ], + "emails": [ + "~Wojciech_Szpankowski1", + "~Gil_I._Shamir1" + ], + "rank": 2354 + }, + { + "url": "https://openreview.net/forum?id=jAJrc-kzVd0", + "decision": "Reject", + "ratings": [ + 6, + 3, + 5, + 4 + ], + "rating": "4.40", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Reinforcement learning (RL) agents need to learn from past experiences. Prioritized experience replay that weighs experiences by their surprise (the magnitude of the temporal-difference error) significantly improves the learning efficiency for RL algorithms. Intuitively, surprise quantifies the unexpectedness of an experience to the learning agent. But how surprise is related to the importance of experience is not well understood. To address this problem, we derive three value metrics to quantify the importance of experience, which consider the extra reward would be earned by accessing the experience. We theoretically show these value metrics are upper-bounded by surprise for Q-learning. Furthermore, we successfully extend our theoretical framework to maximum-entropy RL by deriving the lower and upper bounds of these value metrics for soft Q-learning, which is also related to surprise. Our framework links two important quantities in RL, i.e., surprise and value of experience, and provides a theoretical basis to estimate the value of experience by surprise. We empirically show that the upper bounds hold in practice, and experience replay using the upper bound as priority improves maximum-entropy RL in Atari games. ", + "title": "Revisiting Prioritized Experience Replay: A Value Perspective", + "authors": [ + "Ang A. 
Li", + "Zongqing Lu", + "Chenglin Miao" + ], + "emails": [ + "~Zongqing_Lu2", + "pku.edu.cn" + ], + "rank": 2355 + }, + { + "url": "https://openreview.net/forum?id=Tq_H_EDK-wa", + "decision": "Reject", + "ratings": [ + 7, + 5, + 4, + 3 + ], + "rating": "4.40", + "confidences": [ + 2, + 2, + 2, + 4 + ], + "abstract": "One of the ways that machine learning algorithms can help control the spread of an infectious disease is by building models that predict who is likely to get infected whether or not they display any symptoms, making them good candidates for preemptive isolation. In this work we ask: can we build reliable infection prediction models when the observed data is collected under limited, and biased testing that prioritizes testing symptomatic individuals? Our analysis suggests that under favorable conditions, incomplete testing might be sufficient to achieve relatively good out-of-sample prediction error. Favorable conditions occur when untested-infected individuals have sufficiently different characteristics from untested-healthy, and when the infected individuals are \"potent\", meaning they infect a large majority of their neighbors. We develop an algorithm that predicts infections, and show that it outperforms benchmarks on simulated data. We apply our model to data from a large hospital to predict Clostridioides difficile infections; a communicable disease that is characterized by asymptomatic (i.e., untested) carriers. Using a proxy instead of the unobserved untested-infected state, we show that our model outperforms benchmarks in predicting infections. 
", + "title": "Exploiting structured data for learning contagious diseases under incomplete testing", + "authors": [ + "Maggie Makar", + "Lauren West", + "David Hooper", + "Eric Horvitz", + "Erica Shenoy", + "John Guttag" + ], + "emails": [ + "~John_Guttag2", + "mgh.harvard.edu", + "~Eric_Horvitz1", + "~Maggie_Makar1" + ], + "rank": 2356 + }, + { + "url": "https://openreview.net/forum?id=uRuGNovS11", + "decision": "Reject", + "ratings": [ + 5, + 4, + 3, + 7 + ], + "rating": "4.40", + "confidences": [ + 4, + 5, + 4, + 2 + ], + "abstract": "Label noise is a natural event of data collection and annotation and has been shown to have significant impact on the performance of deep learning models regarding accuracy reduction and sample complexity increase. This paper aims to develop a novel theoretically sound Bayesian deep metric learning that is robust against noisy labels. \nOur proposed approach is inspired by a linear Bayesian large margin nearest neighbor classification, and is a combination of Bayesian learning, triplet loss-based deep metric learning and variational inference frameworks. We theoretically show the robustness under label noise of our proposed method. 
The experimental results on benchmark data sets that contain both synthetic and realistic label noise show a considerable improvement in the classification accuracy of our method compared to the linear Bayesian metric learning and the point estimate deep metric learning.", + "title": "Bayesian Metric Learning for Robust Training of Deep Models under Noisy Labels", + "authors": [ + "Toan Tran", + "Hieu Vu", + "Gustavo Carneiro", + "Hung Bui" + ], + "emails": [ + "vinai.io", + "~Toan_Tran1", + "~Gustavo_Carneiro1", + "~Hung_Bui1" + ], + "rank": 2357 + }, + { + "url": "https://openreview.net/forum?id=Yj4mmVB_l6", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 6 + ], + "rating": "4.40", + "confidences": [ + 3, + 4, + 5, + 3 + ], + "abstract": "Motivated by the training of Generative Adversarial Networks (GANs), we study methods for solving minimax problems with additional nonsmooth regularizers.\nWe do so by employing \\emph{monotone operator} theory, in particular the \\emph{Forward-Backward-Forward (FBF)} method, which avoids the known issue of limit cycling by correcting each update by a second gradient evaluation.\nFurthermore, we propose a seemingly new scheme which recycles old gradients to mitigate the additional computational cost.\nIn doing so we rediscover a known method, related to \\emph{Optimistic Gradient Descent Ascent (OGDA)}.\nFor both schemes we prove novel convergence rates for convex-concave minimax problems via a unifying approach. 
The derived error bounds are in terms of the gap function for the ergodic iterates.\nFor the deterministic and the stochastic problem we show a convergence rate of O(\\nicefrac1k) and O(\\nicefrac1k), respectively.\nWe complement our theoretical results with empirical improvements in the training of Wasserstein GANs on the CIFAR10 dataset.", + "title": "Two steps at a time --- taking GAN training in stride with Tseng's method", + "authors": [ + "Axel B\u00f6hm", + "Michael Sedlmayer", + "Ern\u00f6 Robert Csetnek", + "Radu Ioan Bot" + ], + "emails": [ + "univie.ac.at", + "~Axel_B\u00f6hm1", + "~Ern\u00f6_Robert_Csetnek1", + "~Michael_Sedlmayer1" + ], + "rank": 2358 + }, + { + "url": "https://openreview.net/forum?id=2NU7a9AHo-6", + "decision": "Reject", + "ratings": [ + 5, + 5, + 3 + ], + "rating": "4.40", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "Traditional binary classification models are trained and evaluated with fully labeled data which is not common in real life. In non-ideal dataset, only a small fraction of positive data are labeled. Training a model from such partially labeled data is named as positive-unlabeled (PU) learning. A naive solution of PU learning is treating unlabeled samples as negative. However, using biased data, the trained model may converge to non-optimal point and its real performance cannot be well estimated. Recent works try to recover the unbiased result by estimating the proportion of positive samples with mixture proportion estimation (MPE) algorithms, but the model performance is still limited and heavy computational cost is introduced (particularly for big datasets). In this work, we theoretically prove that Area Under Lift curve (AUL) is an unbiased metric in PU learning scenario, and the experimental evaluation on 9 datasets shows that the average absolute error of AUL estimation is only 1/6 of AUC estimation. 
By experiments we also find that, compared with state-of-the-art AUC-optimization algorithm, AULoptimization algorithm can not only significantly save the computational cost, but also improve the model performance by up to 10%.", + "title": "AUL is a better optimization metric in PU learning", + "authors": [ + "Shangchuan Huang", + "Songtao Wang", + "Dan Li", + "Liwei Jiang" + ], + "emails": [ + "~Shangchuan_Huang1", + "tsinghua.edu.cn", + "126.com", + "~Songtao_Wang1" + ], + "rank": 2359 + }, + { + "url": "https://openreview.net/forum?id=gfwfOskyzSx", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5, + 4 + ], + "rating": "4.40", + "confidences": [ + 5, + 3, + 3, + 4 + ], + "abstract": "The approaches that prevent gradient explosion and vanishing have boosted the performance of deep neural networks in recent years. A unique one among them is the self-normalizing neural network (SNN), which is generally more stable than initialization techniques without explicit normalization. The self-normalization property of SNN in previous studies comes from the Scaled Exponential Linear Unit (SELU) activation function. %, which has achieved competitive accuracy on moderate-scale benchmarks. However, it has been shown that in deeper neural networks, SELU either leads to gradient explosion or loses its self-normalization property. Besides, its accuracy on large-scale benchmarks like ImageNet is less satisfying. In this paper, we analyze the forward and backward passes of SNN with mean-field theory and block dynamical isometry. A new definition for self-normalization property is proposed that is easier to use both analytically and numerically. A proposition is also proposed which enables us compare the strength of the self-normalization property between different activation functions. We further develop two new activation functions, leaky SELU (lSELU) and scaled SELU (sSELU), that have stronger self-normalization property. 
The optimal parameters in them can be easily solved with a constrained optimization program. Besides, analysis on the activation's mean in the forward pass reveals that the self-normalization property on mean gets weaker with larger fan-in, which explains the performance degradation on ImageNet. This can be solved with weight centralization, mixup data augmentation, and centralized activation function. On moderate-scale datasets CIFAR-10, CIFAR-100, and Tiny ImageNet, the direct application of lSELU and sSELU achieves up to 2.13% higher accuracy. On Conv MobileNet V1 - ImageNet, sSELU with Mixup, trainable \u03bb, and centralized activation function reaches 71.95% accuracy that is even better than Batch Normalization.(code in Supplementary Material)", + "title": "Redefining The Self-Normalization Property", + "authors": [ + "Zhaodong Chen", + "Zhao WeiQin", + "Lei Deng", + "Guoqi Li", + "Yuan Xie" + ], + "emails": [ + "~Yuan_Xie1", + "~Zhao_WeiQin1", + "~Zhaodong_Chen1", + "~Lei_Deng1", + "~Guoqi_Li1" + ], + "rank": 2360 + }, + { + "url": "https://openreview.net/forum?id=SnhmiKUPWL", + "decision": "Reject", + "ratings": [ + 4, + 4, + 6, + 4 + ], + "rating": "4.40", + "confidences": [ + 3, + 5, + 3, + 4 + ], + "abstract": "In many classification tasks, the set of classes can be organized according to a meaningful hierarchy. This structure can be used to assess the severity of confusing each pair of classes, and summarized under the form of a cost matrix which also defines a finite metric. We propose to integrate this metric in the supervision of a prototypical network in order to model the hierarchical class structure. Our method relies on jointly learning a feature-extracting network and a set of class representations, or prototypes, which incorporate the error metric into their relative arrangement in the embedding space. 
We show that this simultaneous training allows for consistent improvement of the severity of the network's errors with regard to the class hierarchy when compared to traditional methods and other prototype-based strategies. Furthermore, when the induced metric contains insight on the data structure, our approach improves the overall precision as well. Experiments on four different public datasets\u2014from agricultural time series classification to depth image semantic segmentation\u2014validate our approach.", + "title": "Leveraging Class Hierarchies with Metric-Guided Prototype Learning", + "authors": [ + "Vivien Sainte Fare Garnot", + "Loic Landrieu" + ], + "emails": [ + "~Loic_Landrieu1", + "~Vivien_Sainte_Fare_Garnot1" + ], + "rank": 2361 + }, + { + "url": "https://openreview.net/forum?id=xEpUl1um6V", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5, + 4 + ], + "rating": "4.40", + "confidences": [ + 4, + 2, + 4, + 5 + ], + "abstract": "With the recent expanding attention of machine learning researchers and practitioners to fairness, there is a void of a common framework to analyze and compare the capabilities of proposed models in deep representation learning. In this paper, we evaluate different fairness methods trained with deep neural networks on a common synthetic dataset to obtain a better insight into the working of these methods. In particular, we train about 2000 different models in various setups, including unbalanced and correlated data configurations, to verify the limits of the current models and better understand in which setups they are subject to failure. 
In doing so we present a dataset, a large subset of proposed fairness metrics in the literature, and rigorously evaluate recent promising debiasing algorithms in a common framework hoping the research community would take this benchmark as a common entry point for fair deep learning.", + "title": "Benchmarking Bias Mitigation Algorithms in Representation Learning through Fairness Metrics", + "authors": [ + "Charan Reddy", + "Soroush Mehri", + "Deepak Sharma", + "Samira Shabanian", + "Sina Honari" + ], + "emails": [ + "~Charan_Reddy1", + "~Sina_Honari1", + "~Deepak_Sharma1", + "~Samira_Shabanian1", + "~Soroush_Mehri1" + ], + "rank": 2362 + }, + { + "url": "https://openreview.net/forum?id=M_eaMB2DOxw", + "decision": "Reject", + "ratings": [ + 4, + 6, + 4, + 4 + ], + "rating": "4.40", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Permutation-invariant, -equivariant, and -covariant functions and anti-symmetric functions are important in quantum physics, computer vision, and other disciplines. Applications often require most or all of the following properties: (a) a large class of such functions can be approximated, e.g. all continuous function (b) only the (anti)symmetric functions can be represented (c) a fast algorithm for computing the approximation (d) the representation itself is continuous or differentiable (e) the architecture is suitable for learning the function from data (Anti)symmetric neural networks have recently been developed and applied with great success. A few theoretical approximation results have been proven, but many questions are still open, especially for particles in more than one dimension and the anti-symmetric case, which this work focuses on. More concretely, we derive natural polynomial approximations in the symmetric case, and approximations based on a single generalized Slater determinant in the anti-symmetric case. 
Unlike some previous super-exponential and discontinuous approximations, these seem a more promising basis for future tighter bounds.", + "title": "On Representing (Anti)Symmetric Functions", + "authors": [ + "Marcus Hutter" + ], + "emails": [ + "~Marcus_Hutter1" + ], + "rank": 2363 + }, + { + "url": "https://openreview.net/forum?id=bQtejwuIqB", + "decision": "Reject", + "ratings": [ + 7, + 4, + 3, + 4 + ], + "rating": "4.40", + "confidences": [ + 3, + 4, + 3, + 5 + ], + "abstract": "Adversarial examples arise from excessive sensitivity of a model. Commonly studied adversarial examples are malicious inputs, crafted by an adversary from correctly classified examples, to induce misclassification. This paper studies an intriguing, yet far overlooked consequence of the excessive sensitivity, that is, a misclassified example can be easily perturbed to help the model to produce correct output. Such perturbed examples look harmless, but actually can be maliciously utilized by a false friend to make the model self-satisfied. Thus we name them hypocritical examples. With false friends like these, a poorly performed model could behave like a state-of-the-art one. Once a deployer trusts the hypocritical performance and uses the \"well-performed\" model in real-world applications, potential security concerns appear even in benign environments. In this paper, we formalize the hypocritical risk for the first time and propose a defense method specialized for hypocritical examples by minimizing the tradeoff between natural risk and an upper bound of hypocritical risk. Moreover, our theoretical analysis reveals connections between adversarial risk and hypocritical risk. 
Extensive experiments verify the theoretical results and the effectiveness of our proposed methods.", + "title": "With False Friends Like These, Who Can Have Self-Knowledge?", + "authors": [ + "Lue Tao", + "Songcan Chen" + ], + "emails": [ + "~Songcan_Chen1", + "~Lue_Tao1" + ], + "rank": 2364 + }, + { + "url": "https://openreview.net/forum?id=paE8yL0aKHo", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 6 + ], + "rating": "4.40", + "confidences": [ + 5, + 3, + 4, + 3 + ], + "abstract": "The trade-off between exploration and exploitation has long been a crucial issue in reinforcement learning~(RL). Most of the existing RL methods handle this problem by adding action noise to the policies, such as the Soft Actor-Critic (SAC) that introduces an entropy temperature for maximizing both the external value and the entropy of the policy. However, this temperature is applied indiscriminately to all different environment states, undermining the potential of exploration. In this paper, we argue that the agent should explore more in an unfamiliar state, while less in a familiar state, so as to understand the environment more efficiently. To this purpose, we propose \\textbf{C}uriosity-\\textbf{A}ware entropy \\textbf{T}emperature for SAC (CAT-SAC), which utilizes the curiosity mechanism in developing an instance-level entropy temperature. CAT-SAC uses the state prediction error to model curiosity because an unfamiliar state generally has a large prediction error. The curiosity is added to the target entropy to increase the entropy temperature for unfamiliar states and decrease the target entropy for familiar states. By tuning the entropy specifically and adaptively, CAT-SAC is encouraged to explore when its curiosity is large, otherwise, it is encouraged to exploit. 
Experimental results on the difficult MuJoCo benchmark testify that the proposed CAT-SAC significantly improves the sample efficiency, outperforming the advanced model-based / model-free RL baselines.", + "title": "CAT-SAC: Soft Actor-Critic with Curiosity-Aware Entropy Temperature", + "authors": [ + "Junfan Lin", + "Changxin Huang", + "Xiaodan Liang", + "Liang Lin" + ], + "emails": [ + "~Liang_Lin1", + "~Junfan_Lin1", + "~Xiaodan_Liang2", + "~Changxin_Huang1" + ], + "rank": 2365 + }, + { + "url": "https://openreview.net/forum?id=x2ywTOFM4xt", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4 + ], + "rating": "4.40", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "Saliency maps have been widely used to explain the behavior of an image classifier. We introduce a new interpretability method which considers a saliency map as a random variable and aims to calculate the posterior distribution over the saliency map. The likelihood function is designed to measure the distance between the classifier's predictive probability of an image and that of locally perturbed image. For the prior distribution, we make attributions of adjacent pixels have a positive correlation. We use a variational approximation, and show that the approximate posterior is effective in explaining the classifier's behavior. 
It also has benefits of providing uncertainty over the explanation, giving auxiliary information to experts on how much the explanation is trustworthy.", + "title": "Variational saliency maps for explaining model's behavior", + "authors": [ + "Jae Myung Kim", + "Eunji Kim", + "Seokhyeon Ha", + "Sungroh Yoon", + "Jungwoo Lee" + ], + "emails": [ + "~Jae_Myung_Kim1", + "~Sungroh_Yoon1", + "~Seokhyeon_Ha1", + "~Eunji_Kim2", + "~Jungwoo_Lee1" + ], + "rank": 2366 + }, + { + "url": "https://openreview.net/forum?id=8W7LTo_zxdE", + "decision": "Reject", + "ratings": [ + 2, + 5, + 5, + 5 + ], + "rating": "4.40", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Building on recent advances in uncertainty quantification using a single deep deterministic model (DUQ), we introduce variational Deterministic Uncertainty Quantification (vDUQ). We overcome several shortcomings of DUQ by recasting it as a Gaussian process (GP) approximation. Our principled approximation is based on an inducing point GP in combination with Deep Kernel Learning. This enables vDUQ to use rigorous probabilistic foundations, and work not only on classification but also on regression problems. We avoid uncertainty collapse away from the training data by regularizing the spectral norm of the deep feature extractor. Our method matches SotA accuracy, 96.2\\% on CIFAR-10, while maintaining the speed of softmax models, and provides uncertainty estimates competitive with Deep Ensembles. 
We demonstrate our method in regression problems and by estimating uncertainty in causal inference for personalized medicine", + "title": "Variational Deterministic Uncertainty Quantification", + "authors": [ + "Joost van Amersfoort", + "Lewis Smith", + "Andrew Jesson", + "Oscar Key", + "Yarin Gal" + ], + "emails": [ + "~Andrew_Jesson1", + "~Lewis_Smith1", + "~Joost_van_Amersfoort1", + "~Yarin_Gal1", + "~Oscar_Key1" + ], + "rank": 2367 + }, + { + "url": "https://openreview.net/forum?id=RcJHy18g1M", + "decision": "Reject", + "ratings": [ + 6, + 5, + 4, + 3 + ], + "rating": "4.40", + "confidences": [ + 4, + 3, + 3, + 5 + ], + "abstract": "State-of-the-art deep outlier detection methods map data into a latent space with the aim of having outliers far away from inliers in this space. Unfortunately, this often fails as the divergence penalty they adopt pushes outliers into the same high-probability regions as inliers. We propose a novel method, OP-DMA, that successfully addresses the above problem. OP-DMA succeeds in mapping outliers to low probability regions in the latent space by leveraging a novel Prior-Weighted Loss (PWL) that utilizes the insight that outliers are likely to have a higher reconstruction error than inliers. Building on this insight, OP-DMA weights the reconstruction error of individual points by a multivariate Gaussian probability density function evaluated at each point's latent representation. We demonstrate and provide theoretical proof that this succeeds to map outliers to low-probability regions. 
Our experimental study shows that OP-DMA consistently outperforms state-of-art methods on a rich variety of outlier detection benchmark datasets.", + "title": "Outlier Preserving Distribution Mapping Autoencoders ", + "authors": [ + "Walter Gerych", + "Elke Rundensteiner", + "Emmanuel Agu" + ], + "emails": [ + "~Elke_Rundensteiner2", + "~Emmanuel_Agu1", + "~Walter_Gerych2" + ], + "rank": 2368 + }, + { + "url": "https://openreview.net/forum?id=tEFhwX8s1GN", + "decision": "Reject", + "ratings": [ + 4, + 5, + 6, + 3 + ], + "rating": "4.38", + "confidences": [ + 5, + 4, + 3, + 4 + ], + "abstract": "The stochastic gradient descent (SGD) method, first proposed in 1950's, has been the foundation for deep-neural-network (DNN) training with numerous enhancements including adding a momentum or adaptively selecting learning rates, or using both strategies and more. A common view for SGD is that the learning rate should be eventually made small in order to reach sufficiently good approximate solutions. Another widely held view is that the vanilla SGD is out of fashion in comparison to many of its modern variations. In this work, we provide a contrarian claim that, when training over-parameterized DNNs, the vanilla SGD can still compete well with, and oftentimes outperform, its more recent variations by simply using learning rates significantly larger than commonly used values. 
We establish theoretical results to explain this local convergence behavior of SGD on nonconvex functions, and also present computational evidence, across multiple tasks including image classification, speech recognition and natural language processing, to support the practice of using larger learning rates.\n", + "title": "Training By Vanilla SGD with Larger Learning Rates", + "authors": [ + "Yueyao Yu", + "Jie Wang", + "Wenye Li", + "Yin Zhang" + ], + "emails": [ + "~Yin_Zhang4", + "~Wenye_Li1", + "~Jie_Wang9", + "~Yueyao_Yu1" + ], + "rank": 2369 + }, + { + "url": "https://openreview.net/forum?id=jNTeYscgSw8", + "decision": "Reject", + "ratings": [ + 4, + 6, + 3, + 5 + ], + "rating": "4.38", + "confidences": [ + 4, + 4, + 5, + 3 + ], + "abstract": "It is common to use the softmax cross-entropy loss to train neural networks on classification datasets where a single class label is assigned to each example. However, it has been shown that modifying softmax cross-entropy with label smoothing or regularizers such as dropout can lead to higher performance. In this paper, we compare a variety of loss functions and output layer regularization strategies that improve performance on image classification tasks. We find differences in the outputs of networks trained with these different objectives, in terms of accuracy, calibration, out-of-distribution robustness, and predictions. However, differences in hidden representations of networks trained with different objectives are restricted to the last few layers; representational similarity reveals no differences among network layers that are not close to the output. 
We show that all objectives that improve over vanilla softmax loss produce greater class separation in the penultimate layer of the network, which potentially accounts for improved performance on the original task, but results in features that transfer worse to other tasks.", + "title": "Demystifying Loss Functions for Classification", + "authors": [ + "Simon Kornblith", + "Honglak Lee", + "Ting Chen", + "Mohammad Norouzi" + ], + "emails": [ + "~Mohammad_Norouzi1", + "~Ting_Chen1", + "~Honglak_Lee2", + "~Simon_Kornblith1" + ], + "rank": 2370 + }, + { + "url": "https://openreview.net/forum?id=m0ECRXO6QlP", + "decision": "Reject", + "ratings": [ + 6, + 4, + 4, + 4 + ], + "rating": "4.38", + "confidences": [ + 3, + 3, + 5, + 5 + ], + "abstract": "We investigate a strategy for improving the efficiency of contrastive learning of visual representations by leveraging a small amount of supervised information during pre-training. We propose a semi-supervised loss, SuNCEt, based on noise-contrastive estimation and neighbourhood component analysis, that aims to distinguish examples of different classes in addition to the self-supervised instance-wise pretext tasks. On ImageNet, we find that SuNCEt can be used to match the semi-supervised learning accuracy of previous contrastive approaches while using less than half the amount of pre-training and compute. 
Our main insight is that leveraging even a small amount of labeled data during pre-training, and not only during fine-tuning, provides an important signal that can significantly accelerate contrastive learning of visual representations.", + "title": "Supervision Accelerates Pre-training in Contrastive Semi-Supervised Learning of Visual Representations", + "authors": [ + "Mido Assran", + "Nicolas Ballas", + "Lluis Castrejon", + "Michael Rabbat" + ], + "emails": [ + "~Lluis_Castrejon1", + "~Michael_Rabbat1", + "~Mido_Assran1", + "~Nicolas_Ballas1" + ], + "rank": 2371 + }, + { + "url": "https://openreview.net/forum?id=jsM6yvqiT0W", + "decision": "Reject", + "ratings": [ + 4, + 2, + 7, + 5 + ], + "rating": "4.38", + "confidences": [ + 3, + 5, + 4, + 4 + ], + "abstract": "Modern machine learning models with high accuracy often exhibit poor uncertainty calibration: the output probabilities of the model do not reflect its accuracy, and tend to be over-confident. Existing post-calibration methods such as temperature scaling recalibrate a trained model using rather simple calibrators with one or few parameters, which can have a rather limited capacity. In this paper, we propose Neural Rank Preserving Transforms (NRPT), a new post-calibration method that adjusts the output probabilities of a trained classifier using a calibrator of higher capacity, while maintaining its prediction accuracy. NRPT learns a calibrator that preserves the rank of the probabilities through general monotonic transforms, individualizes to the original input, and allows learning with any loss function that encourages calibration. We show experimentally that NRPT improves the expected calibration error (ECE) significantly over existing post-calibration methods such as (local) temperature scaling on large-scale image and text classification tasks. The performance of NRPT can further match ensemble methods such as deep ensembles, while being much more parameter-efficient. 
We further demonstrate the improved calibration ability of NRPT beyond the ECE metric, such as accuracy among top-confidence predictions, as well as optimizing the tradeoff between calibration and sharpness.", + "title": "Improved Uncertainty Post-Calibration via Rank Preserving Transforms", + "authors": [ + "Yu Bai", + "Tengyu Ma", + "Huan Wang", + "Caiming Xiong" + ], + "emails": [ + "~Yu_Bai1", + "~Caiming_Xiong1", + "~Huan_Wang1", + "~Tengyu_Ma1" + ], + "rank": 2372 + }, + { + "url": "https://openreview.net/forum?id=WkKsWwxnAkt", + "decision": "Reject", + "ratings": [ + 5, + 3, + 5 + ], + "rating": "4.38", + "confidences": [ + 5, + 4, + 4 + ], + "abstract": "Subspace clustering (SC) approaches based on the self-representation model achieved encouraging performance when compared with the clustering algorithms that rely on proximity measures between data points. However, they still face serious limitations in real-world applications. One limitation relates to the linearity assumption of the self-representation model. The reason is that, usually, samples lie in non-linear manifolds, e.g. face images acquired under non-uniform illumination and different poses. Another limitation relates to errors that can be random or sample-specific (outliers), whereas in real-world applications their origin is mostly unknown. Furthermore, the majority of existing algorithms use external clustering validation methods to measure clustering quality, and that requires access to ground-truth (GT) labels. Hence, it is practically important to develop a deep SC method that jointly learns self-expressive (SE) feature representation and handles data corruptions of unknown origin, and estimates clustering error using the internal clustering validation method, i.e. without access to the GT. Mostly, the base of the recently developed deep SC networks is convolutional autoencoder. It is an end-to-end fully convolutional network that is based on the minimization of the reconstruction error. 
Together, the autoencoder and an additional SE module are forming a Deep SC network (DSCNet). Hence, the total loss function of DSCNet is composed of reconstruction loss and SE loss. That is, during the learning process, the quality of clustering is not taken into account. Self-supervised convolutional SC network (S2ConvSCN) addressed this issue through the addition of a fully connected layer (FC) module and a spectral clustering module that, respectively, generate soft- and pseudo-labels.\n\nWhile inheriting the architecture of the S2ConvSCN, this paper proposes a robust SE loss and an early self-stopping criterion for training. Robustness to arbitrary (unknown) types of data corruptions is achieved by using the correntropy induced metric (CIM) of the error of the SE model. By mapping the input data space to a reproducible kernel Hilbert space (RKHS), correntropy defines an \u21132 distance in RKHS and creates nonlinear distance measure in the original input data space. Hence, correntropy is the optimal measure for error with the arbitrary non-Gaussian distribution as opposed to the MSE that implies Gaussian distribution of errors. Thus, because it is estimated directly from data, CIM can handle data corruption of unknown origin. As opposed to the DSCNet and S2ConvSCN, the self-stopping criterion of the proposed algorithm is achieved by reaching a plateau in the change of loss value.\n\nAlthough the architecture of S2ConvSCN contains the FC module, which is capable of handling out-of-sample data by using a softmax classifier, its performance has not been tested on unseen data. The importance of the FC module is, however, emphasized through self-supervision. In a truly unsupervised setting, pseudo-labels generated from the spectral clustering module guide the learning process through the induced classification error at the end of the FC module. They also enhance the goodness of the self-representation matrix through penalizing incorrectly connected elements. 
Adding these two loss functions to the total loss makes pseudo-labels an especially important component in label-constrained applications. Such dual self-supervision, in theory, enables deep networks to learn representation from available unlabeled data and to use the small number of labeled data to validate the trained model.\n\nIn addition to the main contributions of the paper, this study has three side-contributions. It aimed to set up a more transparent way for the proper performance estimation of deep SC models and to integrate block-diagonal regularization into the gradient descent learning process. The majority of SC studies optimize hyperparameters using external clustering validation methodology, whereas the same labels are used for tuning hyperparameters and evaluating the final clustering performance. Furthermore, models like S2ConvSCN suffer from an early commitment problem as they depend on weight-transfer from pretrained models that have \u201cseen\u201d the GT already (e.g. DSCNet transferred to S2ConvSCN). From the machine learning perspective, data leakage and hyperparameter tuning based on the test GT are unacceptable. Measured clustering performances of S2ConvSCN and DSCNet on the out-of-sample (unseen) data, thus, differ significantly from the optimistic one presented in the original papers. Also, post-processing of self-representation matrix is reduced to a significant extent. Robust S2ConvSCN outperforms its baseline version by a significant amount for both seen and unseen data on four well-known datasets. 
To the best of our knowledge, such an ablation study on unseen data has not been reported previously in SC studies.", + "title": "Subspace Clustering via Robust Self-Supervised Convolutional Neural Network", + "authors": [ + "Dario Sitnik", + "Ivica Kopriva" + ], + "emails": [ + "~Dario_Sitnik1", + "irb.hr" + ], + "rank": 2373 + }, + { + "url": "https://openreview.net/forum?id=6SXNhWc5HFe", + "decision": "Reject", + "ratings": [ + 5, + 3, + 5, + 5 + ], + "rating": "4.38", + "confidences": [ + 5, + 5, + 3, + 3 + ], + "abstract": "We propose a reinforcement learning algorithm for stationary mean-field games, where the goal is to learn a pair of mean-field state and stationary policy that constitutes the Nash equilibrium. When viewing the mean-field state and the policy as two players, we propose a fictitious play algorithm which alternatively updates the mean-field state and the policy via gradient-descent and proximal policy optimization, respectively. Our algorithm is in stark contrast with previous literature which solves each single-agent reinforcement learning problem induced by the iterates mean-field states to the optimum. Furthermore, we prove that our fictitious play algorithm converges to the Nash equilibrium at a sublinear rate. 
To the best of our knowledge, this seems the first provably convergent reinforcement learning algorithm for mean-field games based on iterative updates of both mean-field state and policy.", + "title": "Provable Fictitious Play for General Mean-Field Games", + "authors": [ + "Qiaomin Xie", + "Zhuoran Yang", + "Zhaoran Wang", + "Andreea Minca" + ], + "emails": [ + "~Qiaomin_Xie1", + "~Andreea_Minca1", + "~Zhuoran_Yang1", + "~Zhaoran_Wang1" + ], + "rank": 2374 + }, + { + "url": "https://openreview.net/forum?id=qUs18ed9oe", + "ratings": [ + 4, + 6, + 4, + 4 + ], + "rating": "4.38", + "confidences": [ + 4, + 3, + 5, + 4 + ], + "abstract": "Deploying Reinforcement Learning (RL) agents in the real-world require that the agents satisfy safety constraints. Current RL agents explore the environment without considering these constraints, which can lead to damage to the hardware or even other agents in the environment. We propose a new method, LBPO, that uses a Lyapunov-based barrier function to restrict the policy update to a safe set for each training iteration. Our method also allows the user to control the conservativeness of the agent with respect to the constraints in the environment. LBPO significantly outperforms state-of-the-art baselines in terms of the number of constraint violations during training while being competitive in terms of performance. 
Further, our analysis reveals that baselines like CPO and SDDPG rely mostly on backtracking to ensure safety rather than safe projection, which provides insight into why previous methods might not have effectively limit the number of constraint violations.", + "title": "Lyapunov Barrier Policy Optimization", + "authors": [ + "Harshit Sikchi", + "Wenxuan Zhou", + "David Held" + ], + "emails": [ + "~Harshit_Sikchi1", + "~Wenxuan_Zhou1", + "~David_Held1" + ], + "rank": 2375 + }, + { + "url": "https://openreview.net/forum?id=XCgFz7-l0QG", + "ratings": [ + 6, + 5, + 4, + 3 + ], + "rating": "4.38", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "Knowledge distillation (KD) is one of the prominent techniques for model compression. In this method, the knowledge\nof a large network (teacher) is distilled into a model (student) with usually significantly fewer parameters. KD tries to\nbetter-match the output of the student model to that of the teacher model based on the knowledge extracts from\nthe forward pass of the teacher network. Although conventional KD is effective for matching the two networks over the given\ndata points, there is no guarantee that these models would match in other areas for which we do not have enough\ntraining samples. In this work, we address that problem by generating new auxiliary training samples based on\nextracting knowledge from the backward pass of the teacher in the areas where the student diverges greatly from the\nteacher. We compute the difference between the teacher and the student and generate new data samples that\nmaximize the divergence. This is done by perturbing data samples in the direction of the gradient of the difference\nbetween the student and the teacher. Augmenting the training set by adding this auxiliary improves the performance of\nKD significantly and leads to a closer match between the student and the teacher. 
Using this approach, when data samples\ncome from a discrete domain, such as applications of natural language processing (NLP) and language understanding,\nis not trivial. However, we show how this technique can be used successfully in such applications. We studied the effect\nof the proposed method on various tasks in different domains, including images and NLP tasks with considerably\nsmaller student networks. The results of our experiments, when compared with the original KD, show 4% improvement on MNIST with a student network that is 160 times smaller, 1% improvement on a CIFAR-10 dataset with\na student that is 9 times smaller, and an average 1.5% improvement on the GLUE benchmark with a distilroBERTa-base\nstudent.", + "title": "Improved knowledge distillation by utilizing backward pass knowledge in neural networks", + "authors": [ + "Aref Jafari", + "Mehdi Rezagholizadeh", + "Ali Ghodsi" + ], + "emails": [ + "~Aref_Jafari1", + "~Mehdi_Rezagholizadeh1", + "~Ali_Ghodsi1" + ], + "rank": 2376 + }, + { + "url": "https://openreview.net/forum?id=PpOtGYNVT6A", + "decision": "Reject", + "ratings": [ + 3, + 4, + 5, + 7 + ], + "rating": "4.38", + "confidences": [ + 4, + 4, + 3, + 2 + ], + "abstract": "Strong progress has been achieved in semi-supervised learning (SSL) by combining several methods, some of which relate to properties of the data distribution p(x), others to the model outputs p(y|x), e.g. minimising the entropy of unlabelled predictions. Focusing on the latter, we fill a gap in the standard text by introducing a probabilistic model for discriminative semi-supervised learning, mirroring the classical generative model. Several SSL methods are theoretically explained by our model as inducing (approximate) strong priors over parameters of p(y|x). 
Applying this same probabilistic model to tasks in which labels represent binary attributes, we theoretically justify a family of neuro-symbolic SSL approaches, taking a step towards bridging the divide between statistical learning and logical reasoning.", + "title": "A Probabilistic Model for Discriminative and Neuro-Symbolic Semi-Supervised Learning", + "authors": [ + "Carl Allen", + "Ivana Balazevic", + "Timothy Hospedales" + ], + "emails": [ + "~Ivana_Balazevic1", + "~Carl_Allen1", + "~Timothy_Hospedales1" + ], + "rank": 2377 + }, + { + "url": "https://openreview.net/forum?id=k2TyMLwuikx", + "ratings": [ + 5, + 3, + 5, + 5 + ], + "rating": "4.38", + "confidences": [ + 4, + 5, + 4, + 3 + ], + "abstract": "We propose a novel method for compressing Deep Neural Networks (DNNs) with competitive performance to state-of-the-art methods. We measure a new model-free information between the feature maps and the output of the network. Model-freeness of our information measure guarantees that no parametric assumptions on the feature distribution are required. The new model-free information is subsequently used to prune a collection of redundant layers in the networks with skip-connections. Numerical experiments on CIFAR-10/100, SVHN, Tiny ImageNet, and ImageNet data sets show the efficacy of the proposed approach in compressing deep models. For instance, in classifying CIFAR-10 images our method achieves respectively 64.50% and 60.31% reduction in the number of parameters and FLOPs for a full DenseNet model with 0.77 million parameters while dropping only 1% in the test accuracy. 
Our code is available at https://github.com/suuyawu/PEDmodelcompression", + "title": "Model-Free Energy Distance for Pruning DNNs", + "authors": [ + "Mohammadreza Soltani", + "Suya Wu", + "Yuerong Li", + "Jie Ding", + "Vahid Tarokh" + ], + "emails": [ + "~Mohammadreza_Soltani1", + "~Suya_Wu1", + "duke.edu", + "~Vahid_Tarokh1", + "~Jie_Ding2" + ], + "rank": 2378 + }, + { + "url": "https://openreview.net/forum?id=RkqYJw5TMD7", + "decision": "Reject", + "ratings": [ + 7, + 3, + 4, + 5 + ], + "rating": "4.38", + "confidences": [ + 2, + 4, + 4, + 3 + ], + "abstract": "This paper studies test-time adaptation in the context of adversarial robustness. We formulate an adversarial threat model for test-time adaptation, where the defender may have a unique advantage as the adversarial game becomes a maximin game, instead of a minimax game as in the classic adversarial robustness threat model. We then study whether the maximin threat model admits more ``good solutions'' than the minimax threat model, and is thus \\emph{strictly weaker}. For this purpose, we first present a provable separation between the two threat models in a natural Gaussian data model. For deep learning, while we do not have a proof, we propose a candidate, Domain Adversarial Neural Networks (DANN), an algorithm designed for unsupervised domain adaptation, by showing that it provides nontrivial robustness in the test-time maximin threat model against strong transfer attacks and adaptive attacks. This is somewhat surprising since DANN is not designed specifically for adversarial robustness (e.g., against norm-based attacks), and provides no robustness in the minimax model. Complementing these results, we show that recent data-oblivious test-time adaptations can be easily attacked even with simple transfer attacks. 
We conclude the paper with various future directions of studying adversarially robust test-time adaptation.", + "title": "Test-Time Adaptation and Adversarial Robustness", + "authors": [ + "Xi Wu", + "Yang Guo", + "Tianqi Li", + "Jiefeng Chen", + "Qicheng Lao", + "Yingyu Liang", + "Somesh Jha" + ], + "emails": [ + "~Tianqi_Li2", + "~Jiefeng_Chen2", + "~Xi_Wu1", + "~Yang_Guo4", + "~Yingyu_Liang1", + "~Somesh_Jha1", + "~Qicheng_Lao2" + ], + "rank": 2379 + }, + { + "url": "https://openreview.net/forum?id=2wjKRmraNan", + "decision": "Reject", + "ratings": [ + 2, + 6, + 5, + 5 + ], + "rating": "4.36", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "The need of Feature Compatible Learning (FCL) arises from many large scale retrieval-based applications, where updating the entire library of embedding vectors is expensive. When an upgraded embedding model shows potential, it is desired to transform the benefit of the new model without refreshing the library. While progresses have been made along this new direction, existing approaches for feature compatible learning mostly rely on old training data and classifiers, which are not available in many industry settings. In this work, we introduce an approach for feature compatible learning without inheriting old classifier and training data, i.e., Non-Inherent Feature Compatible Learning. Our approach requires only features extracted by \\emph{old} model's backbone and \\emph{new} training data, and makes no assumption about the overlap between old and new training data. We propose a unified framework for FCL, and extend it to handle the case where the old model is a black-box. Specifically, we learn a simple pseudo classifier in lieu of the old model, and further enhance it with a random walk algorithm. As a result, the embedding features produced by the new model can be matched with those from the old model without sacrificing performance. 
Experiments on ImageNet ILSVRC 2012 and Places365 data proved the efficacy of the proposed approach.", + "title": "Non-Inherent Feature Compatible Learning", + "authors": [ + "Yantao Shen", + "Fanzi Wu", + "Ying Shan" + ], + "emails": [ + "~Ying_Shan2", + "~Fanzi_Wu1", + "~Yantao_Shen2" + ], + "rank": 2380 + }, + { + "url": "https://openreview.net/forum?id=YtgKRmhAojv", + "decision": "Reject", + "ratings": [ + 4, + 6, + 4, + 4 + ], + "rating": "4.36", + "confidences": [ + 2, + 2, + 2, + 5 + ], + "abstract": "Orthogonal weight matrices are used in many areas of deep learning. Much previous work attempt to alleviate the additional computational resources it requires to constrain weight matrices to be orthogonal. One popular approach utilizes *many* Householder reflections. The only practical drawback is that many reflections cause low GPU utilization. We mitigate this final drawback by proving that *one* reflection is sufficient, if the reflection is computed by an auxiliary neural network.", + "title": "One Reflection Suffice", + "authors": [ + "Alexander Mathiasen", + "Frederik Hvilsh\u00f8j" + ], + "emails": [ + "~Alexander_Mathiasen2", + "gmail.com" + ], + "rank": 2381 + }, + { + "url": "https://openreview.net/forum?id=tzfpltOnsJZ", + "ratings": [ + 6, + 3, + 4 + ], + "rating": "4.36", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "Photorealistic image generation is progressing rapidly and has reached a new level of quality, thanks to the invention and breakthroughs of generative adversarial networks (GANs). Yet the dark side of such deepfakes, the malicious use of generated media, never stops raising concerns of visual misinformation. Existing research works on deepfake detection demonstrate impressive accuracy, while it is accompanied by adversarial iterations on detection countermeasure techniques. 
In order to lead this arms race to the end, we investigate a fundamental solution on deepfake detection, agnostic to the evolution of GANs in order to enable a responsible disclosure or regulation of such double-edged techniques. We propose to embed artificial fingerprints into GAN training data, and show a surprising discovery on the transferability of such fingerprints from training data to GAN models, which in turn enables reliable detection and attribution of deepfakes. Our empirical study shows that our fingerprinting technique (1) holds for different state-of-the-art GAN configurations, (2) turns more effective along with the development of GAN techniques, (3) has a negligible side effect on the generation quality, and (4) stays robust against image-level and model-level perturbations. When we allocate each GAN publisher a unique artificial fingerprint, the margins between real data and deepfakes, and the margins among different deepfake sources are fundamentally guaranteed. As a result, we are able to evidence accurate deepfake detection/attribution using our fingerprint decoder, which makes this solution stand out from the current arms race.", + "title": "Artificial GAN Fingerprints: Rooting Deepfake Attribution in Training Data", + "authors": [ + "Ning Yu", + "Vladislav Skripniuk", + "Sahar Abdelnabi", + "Mario Fritz" + ], + "emails": [ + "~Mario_Fritz1", + "~Sahar_Abdelnabi1", + "mpi-inf.mpg.de", + "~Ning_Yu2" + ], + "rank": 2382 + }, + { + "url": "https://openreview.net/forum?id=Uqu9yHvqlRf", + "decision": "Reject", + "ratings": [ + 6, + 5, + 3 + ], + "rating": "4.36", + "confidences": [ + 3, + 3, + 5 + ], + "abstract": "The emergence of language is a mystery. One dominant theory is that cooperation boosts language to emerge. However, as a means of giving out information, language seems not to be an evolutionarily stable strategy. To ensure the survival advantage of many competitors, animals are selfish in nature. 
From the perspective of Darwinian, if an individual can obtain a higher benefit by deceiving the other party, why not deceive? For those who are cheated, once bitten and twice shy, cooperation will no longer be a good option. As a result, motivation for communication, as well as the emergence of language would perish. Then, what preserves the emergence of language? We aim to answer this question in a brand new framework of agent community, reinforcement learning, and natural selection. Empirically, we reveal that lying indeed dispels cooperation. Even with individual resistance to lying behaviors, liars can easily defeat truth tellers and survive during natural selection. However, social resistance eventually constrains lying and makes the emergence of language possible.", + "title": "What Preserves the Emergence of Language?", + "authors": [ + "Ziluo Ding", + "Tiejun Huang", + "Zongqing Lu" + ], + "emails": [ + "~Zongqing_Lu2", + "~Tiejun_Huang1", + "~Ziluo_Ding1" + ], + "rank": 2383 + }, + { + "url": "https://openreview.net/forum?id=Dh29CAlnMW", + "decision": "Reject", + "ratings": [ + 4, + 4, + 6 + ], + "rating": "4.36", + "confidences": [ + 5, + 4, + 2 + ], + "abstract": "The Automunge open source python library platform for tabular data pre-processing automates feature engineering data transformations of numerical encoding and missing data infill to received tidy data on bases fit to properties of columns in a designated train set for consistent and efficient application to subsequent data pipelines such as for inference, where transformations may be applied to distinct columns in \u201cfamily tree\u201d sets with generations and branches of derivations. 
Included in the library of transformations are methods to extract structure from bounded categorical string sets by way of automated string parsing, in which comparisons between entries in the set of unique values are parsed to identify character subset overlaps which may be encoded by appended columns of boolean overlap detection activations or by replacing string entries with identified overlap partitions. Further string parsing options, which may also be applied to unbounded categoric sets, include extraction of numeric substring partitions from entries or search functions to identify presence of specified substring partitions. The aggregation of these methods into \u201cfamily tree\u201d sets of transformations are demonstrated for use to automatically extract structure from categoric string compositions in relation to the set of entries in a column, such as may be applied to prepare categoric string set encodings for machine learning without human intervention.", + "title": "Parsed Categoric Encodings with Automunge", + "authors": [ + "Nicholas Teague" + ], + "emails": [ + "~Nicholas_Teague1" + ], + "rank": 2384 + }, + { + "url": "https://openreview.net/forum?id=HWX5j6Bv_ih", + "decision": "Reject", + "ratings": [ + 6, + 3, + 6, + 5 + ], + "rating": "4.36", + "confidences": [ + 1, + 5, + 2, + 3 + ], + "abstract": "Vast amount of data generated from networks of sensors, wearables, and the Internet of Things (IoT) devices underscores the need for advanced modeling techniques that leverage the spatio-temporal structure of decentralized data due to the need for edge computation and licensing (data access) issues. While federated learning (FL) has emerged as a framework for model training without requiring direct data sharing and exchange, effectively modeling the complex spatio-temporal dependencies to improve forecasting capabilities still remains an open problem. 
On the other hand, state-of-the-art spatio-temporal forecasting models assume unfettered access to the data, neglecting constraints on data sharing. To bridge this gap, we propose a federated spatio-temporal model -- Cross-Node Federated Graph Neural Network (CNFGNN) -- which explicitly encodes the underlying graph structure using graph neural network (GNN)-based architecture under the constraint of cross-node federated learning, which requires that data in a network of nodes is generated locally on each node and remains decentralized. CNFGNN operates by disentangling the temporal dynamics modeling on devices and spatial dynamics on the server, utilizing alternating optimization to reduce the communication cost, facilitating computations on the edge devices. Experiments on the traffic flow forecasting task show that CNFGNN achieves the best forecasting performance in both transductive and inductive learning settings with no extra computation cost on edge devices, while incurring modest communication cost.", + "title": "Cross-Node Federated Graph Neural Network for Spatio-Temporal Data Modeling", + "authors": [ + "Chuizheng Meng", + "Sirisha Rambhatla", + "Yan Liu" + ], + "emails": [ + "~Chuizheng_Meng1", + "~Sirisha_Rambhatla1", + "~Yan_Liu1" + ], + "rank": 2385 + }, + { + "url": "https://openreview.net/forum?id=_EQxgdRFUHG", + "ratings": [ + 4, + 5, + 4 + ], + "rating": "4.36", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "In this paper, we consider problems where multiple predictions can be considered correct, but only one of them is given as supervision. This setting differs from both the regression and class-conditional generative modelling settings: in the former, there is a unique ground truth for each input, which is provided as supervision; in the latter, there are many ground truths for each input, and many are provided as supervision. 
Applying either regression methods and conditional generative models to the setting considered in this paper often results in a model that can only make a single prediction for each input. We explore several problems that have this property, which naturally arise in image processing, and develop an approach that can generate multiple high-quality predictions given the same input. As a result, it can be used to generate high-quality outputs that are different from the observed ground truth. ", + "title": "Generating Unobserved Alternatives: A Case Study through Super-Resolution and Decompression", + "authors": [ + "Shichong Peng", + "Ke Li" + ], + "emails": [ + "~Shichong_Peng1", + "~Ke_Li1" + ], + "rank": 2386 + }, + { + "url": "https://openreview.net/forum?id=-DRft_lKDqo", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4, + 5 + ], + "rating": "4.36", + "confidences": [ + 4, + 2, + 5, + 3 + ], + "abstract": "To certify safety and robustness of neural networks, researchers have successfully applied abstract interpretation, primarily using interval bound propagation. To understand the power of interval bounds, we present the abstract universal approximation (AUA) theorem, a generalization of the recent result by Baader et al. (2020) for ReLU networks to a large class of neural networks. The AUA theorem states that for any continuous function f, there exists a neural network that (1) approximates f (universal approximation) and (2) whose interval bounds are an arbitrarily close approximation of the set semantics of f. The network may be constructed using any activation function from a rich class of functions---sigmoid, tanh, ReLU, ELU, etc.---making our result quite general. 
The key implication of the AUA theorem is that there always exists certifiably robust neural networks, which can be constructed using a wide range of activation functions.", + "title": "Generalized Universal Approximation for Certified Networks", + "authors": [ + "Zi Wang", + "Aws Albarghouthi", + "Somesh Jha" + ], + "emails": [ + "~Somesh_Jha1", + "~Zi_Wang3", + "~Aws_Albarghouthi1" + ], + "rank": 2387 + }, + { + "url": "https://openreview.net/forum?id=kqDCPX7eWS", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5 + ], + "rating": "4.36", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "Distributed variants of stochastic gradient descent (SGD) are central to training deep neural networks on massive datasets. \nSeveral scalable versions of data-parallel SGD have been developed, leveraging asynchrony, communication-compression, and local gradient steps. Current research seeks a balance between distributed scalability--seeking to minimize the amount of synchronization needed--and generalization performance--seeking to achieve the same or better accuracy relative to the sequential baseline. However, a key issue in this regime is largely unaddressed: if ``local\" data-parallelism is aggressively applied to better utilize the computing resources available with workers, generalization performance of the trained model degrades. \n\nIn this paper, we present a method to improve the \"local scalability\" of decentralized SGD. In particular, we propose two key techniques: (a) shared-memory based asynchronous gradient updates at decentralized workers keeping the local minibatch size small, and (b) an asynchronous non-blocking in-place averaging overlapping the local updates, thus essentially utilizing all compute resources at all times without the need for large minibatches. Empirically, the additional noise introduced in the procedure proves to be a boon for better generalization. 
On the theoretical side, we show that this method guarantees ergodic convergence for non-convex objectives, and achieves the classic sublinear rate under standard assumptions. \nOn the practical side, we show that it improves upon the performance of local SGD and related schemes, without compromising accuracy. ", + "title": "Local SGD Meets Asynchrony", + "authors": [ + "Bapi Chatterjee", + "Vyacheslav Kungurtsev", + "Dan Alistarh" + ], + "emails": [ + "~Vyacheslav_Kungurtsev1", + "~Dan_Alistarh7", + "~Bapi_Chatterjee1" + ], + "rank": 2388 + }, + { + "url": "https://openreview.net/forum?id=_adSMszz_g9", + "decision": "Reject", + "ratings": [ + 3, + 4, + 5, + 6 + ], + "rating": "4.36", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Transformer models have obtained remarkable accomplishments in various NLP tasks. However, these models have efficiency issues on long sequences, as the complexity of their self-attention module scales quadratically with the sequence length. To remedy the limitation, we present Memformer, a novel language model that utilizes a single unified memory to encode and retrieve past information. It includes a new optimization scheme, Memory Replay Back-Propagation, which promotes long-range back-propagation through time with a significantly reduced memory requirement. Memformer achieves O(n) time complexity and O(1) space complexity in processing long sequences, meaning that the model can handle an infinite length sequence during inference. Our model is also compatible with other self-supervised tasks to further improve the performance on language modeling. 
Experimental results show that Memformer outperforms the previous long-range sequence models on WikiText-103, including Transformer-XL and Compressive Transformer.", + "title": "Memformer: The Memory-Augmented Transformer", + "authors": [ + "Qingyang Wu", + "Zhenzhong Lan", + "Jing Gu", + "Zhou Yu" + ], + "emails": [ + "~Qingyang_Wu1", + "westlake.edu.cn", + "~Zhou_Yu1", + "~Jing_Gu2" + ], + "rank": 2389 + }, + { + "url": "https://openreview.net/forum?id=WdOCkf4aCM", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 4 + ], + "rating": "4.36", + "confidences": [ + 3, + 2, + 4, + 5 + ], + "abstract": "Deep Reinforcement Learning (DRL) has recently achieved significant advances in various domains. However, explaining the policy of RL agents still remains an open problem due to several factors, one being the complexity of explaining neural networks decisions. Recently, a group of works have used decision-tree-based models to learn explainable policies. Soft decision trees (SDTs) and discretized differentiable decision trees (DDTs) have been demonstrated to achieve both good performance and share the benefit of having explainable policies. In this work, we further improve the results for tree-based explainable RL in both performance and explainability. Our proposal, Cascading Decision Trees (CDTs) apply representation learning on the decision path to allow richer expressivity. Empirical results show that in both situations, where CDTs are used as policy function approximators or as imitation learners to explain black-box policies, CDTs can achieve better performances with more succinct and explainable models than SDTs. 
As a second contribution our study reveals limitations of explaining black-box policies via imitation learning with tree-based explainable models, due to its inherent instability.", + "title": "CDT: Cascading Decision Trees for Explainable Reinforcement Learning", + "authors": [ + "Zihan Ding", + "Pablo Hernandez-Leal", + "Gavin Weiguang Ding", + "Changjian Li", + "Ruitong Huang" + ], + "emails": [ + "~Changjian_Li1", + "~Ruitong_Huang1", + "~Pablo_Hernandez-Leal2", + "~Zihan_Ding1", + "~Gavin_Weiguang_Ding1" + ], + "rank": 2390 + }, + { + "url": "https://openreview.net/forum?id=PrvaKdJcKhX", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4, + 6 + ], + "rating": "4.36", + "confidences": [ + 4, + 2, + 4, + 1 + ], + "abstract": "Resource allocation for coverage of physical spaces is a challenging problem in robotic surveillance, mobile sensor networks and security domains. Recent gradient-based optimization approaches to this problem estimate utilities of actions by using neural networks to learn a differentiable approximation to spatial coverage objectives. In this work, we empirically show that spatial coverage objectives with multiple-resources are combinatorially hard to approximate for neural networks and lead to sub-optimal policies. As our major contribution, we propose a tractable framework to approximate a general class of spatial coverage objectives and their gradients using a combination of Newton-Leibniz theorem, spatial discretization and implicit boundary differentiation. 
We empirically demonstrate the efficacy of our proposed framework on single and multi-agent spatial coverage problems.", + "title": "Differentiable Approximations for Multi-resource Spatial Coverage Problems", + "authors": [ + "Nitin Kamra", + "Yan Liu" + ], + "emails": [ + "~Yan_Liu1", + "~Nitin_Kamra1" + ], + "rank": 2391 + }, + { + "url": "https://openreview.net/forum?id=4qgEGwOtxU", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5 + ], + "rating": "4.36", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "As deep neural networks become more advanced and widely-used, it is important to understand their inner workings. Toward this goal, modular interpretations are appealing because they offer flexible levels of abstraction aside from standard architectural building blocks (e.g., neurons, channels, layers). In this paper, we consider the problem of assessing how functionally interpretable a given partitioning of neurons is. We propose two proxies for this: importance which reflects how crucial sets of neurons are to network performance, and coherence which reflects how consistently their neurons associate with input/output features. To measure these proxies, we develop a set of statistical methods based on techniques that have conventionally been used for the interpretation of individual neurons. We apply these methods on partitionings generated by a spectral clustering algorithm which uses a graph representation of the network's neurons and weights. We show that despite our partitioning algorithm using neither activations nor gradients, it reveals clusters with a surprising amount of importance and coherence. 
Together, these results support the use of modular interpretations, and graph-based partitionings in particular, for interpretability.", + "title": "Importance and Coherence: Methods for Evaluating Modularity in Neural Networks", + "authors": [ + "Shlomi Hod", + "Stephen Casper", + "Daniel Filan", + "Cody Wild", + "Andrew Critch", + "Stuart Russell" + ], + "emails": [ + "~Shlomi_Hod1", + "~Daniel_Filan1", + "~Cody_Wild1", + "~Stuart_Russell1", + "~Andrew_Critch1", + "~Stephen_Casper1" + ], + "rank": 2392 + }, + { + "url": "https://openreview.net/forum?id=9MdLwggYa02", + "decision": "Reject", + "ratings": [ + 6, + 3, + 4, + 4 + ], + "rating": "4.35", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "In most pragmatic settings, data augmentation and regularization are essential, and require hyperparameter search.\nPopulation based training (PBT) is an effective tool for efficiently finding them as well as schedules over hyperparameters.\nIn this paper, we compare existing PBT algorithms and contribute a new one: ROMUL, for RObust MULtistep search, which adapts its stepsize over the course of training.\nWe report competitive results with standard models on CIFAR (image classification) as well as Penn Tree Bank (language modeling), which both depend on heavy regularization.\nWe also open-source hoptim, a PBT library agnostic to the training framework, which is simple to use, reentrant, and provides good defaults with ROMUL.", + "title": "ROMUL: Scale Adaptative Population Based Training", + "authors": [ + "Daniel HAZIZA", + "J\u00e9r\u00e9my Rapin", + "Gabriel Synnaeve" + ], + "emails": [ + "~Gabriel_Synnaeve1", + "fb.com", + "~Daniel_HAZIZA2" + ], + "rank": 2393 + }, + { + "url": "https://openreview.net/forum?id=T1EMbxGNEJC", + "decision": "Reject", + "ratings": [ + 4, + 5, + 3, + 6 + ], + "rating": "4.35", + "confidences": [ + 5, + 3, + 5, + 4 + ], + "abstract": "Semi-supervised learning (SSL) has played an important role in leveraging unlabeled data when 
labeled data is limited. One of the most successful SSL approaches is based on consistency regularization, which encourages the model to produce unchanged with perturbed input. However, there has been less attention spent on inputs that have the same label. Motivated by the observation that the inputs having the same label should have the similar model outputs, we propose a novel method, RankingMatch, that considers not only the perturbed inputs but also the similarity among the inputs having the same label. We especially introduce a new objective function, dubbed BatchMean Triplet loss, which has the advantage of computational efficiency while taking into account all input samples. Our RankingMatch achieves state-of-the-art performance across many standard SSL benchmarks with a variety of labeled data amounts, including 95.13% accuracy on CIFAR-10 with 250 labels, 77.65% accuracy on CIFAR-100 with 10000 labels, 97.76% accuracy on SVHN with 250 labels, and 97.77% accuracy on SVHN with 1000 labels. We also perform an ablation study to prove the efficacy of the proposed BatchMean Triplet loss against existing versions of Triplet loss.", + "title": "RankingMatch: Delving into Semi-Supervised Learning with Consistency Regularization and Ranking Loss", + "authors": [ + "Trung Quang Tran", + "Mingu Kang", + "Daeyoung Kim" + ], + "emails": [ + "~Trung_Quang_Tran1", + "~Mingu_Kang1", + "~Daeyoung_Kim3" + ], + "rank": 2394 + }, + { + "url": "https://openreview.net/forum?id=JvPsKam58LX", + "decision": "Reject", + "ratings": [ + 4, + 6, + 3, + 4, + 5 + ], + "rating": "4.35", + "confidences": [ + 3, + 3, + 4, + 3, + 4 + ], + "abstract": "In this paper we deal with robust cooperative multi-agent reinforcement learning (CMARL). While CMARL has many potential applications, only a trained policy that is robust enough can be confidently deployed in real world. 
Existing works on robust MARL mainly apply vanilla adversarial training in centralized training and decentralized execution paradigm. We, however, find that if a CMARL environment contains an adversarial agent, the performance of decentralized equilibrium might perform significantly poor for achieving such adversarial robustness. To tackle this issue, we suggest that when execution the non-adversarial agents must jointly make the decision to improve the robustness, therefore solving correlated equilibrium instead. We theoretically demonstrate the superiority of correlated equilibrium over the decentralized one in adversarial MARL settings. Therefore, to achieve robust CMARL, we introduce novel strategies to encourage agents to learn correlated equilibrium while maximally preserving the convenience of the decentralized execution. The global variables with mutual information are proposed to help agents learn robust policies with MARL algorithms. The experimental results show that our method can dramatically boost performance on the SMAC environments.", + "title": "Robust Multi-Agent Reinforcement Learning Driven by Correlated Equilibrium", + "authors": [ + "Yizheng Hu", + "Kun Shao", + "Dong Li", + "Jianye HAO", + "Wulong Liu", + "Yaodong Yang", + "Jun Wang", + "Zhanxing Zhu" + ], + "emails": [ + "~Wulong_Liu1", + "~Zhanxing_Zhu1", + "~Jianye_HAO1", + "~Yizheng_Hu1", + "~Jun_Wang2", + "~Yaodong_Yang1", + "~Kun_Shao1", + "~Dong_Li10" + ], + "rank": 2395 + }, + { + "url": "https://openreview.net/forum?id=kg63Tjpmc9Z", + "ratings": [ + 5, + 3, + 4, + 6 + ], + "rating": "4.35", + "confidences": [ + 3, + 5, + 5, + 4 + ], + "abstract": "In this paper, we introduce pioneering algorithms for multi-view arbitrary style transfer. Multi-view arbitrary style transfer is an advanced study of the conventional monocular arbitrary style transfer, which aims to preserve the consistent style on the common region across the given arbitrary number of views. 
We intend to address the multi-view inconsistency problem by minimizing the difference in the color and feature values of the corresponding regions. However, the conventional feature extractors generally produce a feature vector of a point from its rectangular local patch, and such local patches are misaligned across the views in a multi-view environment due to various camera poses. Thus, even if we assimilate the feature vectors of the corresponding pixels, since the spatial distribution of the surrounding feature vectors within their local patches are different and decoding such misaligned patches induces misaligned brushstrokes and geometric patterns, these feature vectors can be decoded to distinctive style texture. Based on the observation, we intend to interpret this challenging problem in terms of the photometric inconsistency and the stroke inconsistency. We propose a photometric consistency loss, which directly enforces the geometrically consistent style texture across the view, and a stroke consistency loss, which matches the characteristics and directions of the brushstrokes by aligning the local patches of the corresponding pixels before minimizing feature deviation. Then, We construct an optimization-based multi-view arbitrary style transfer framework (MVAST-O) with photometric and stroke consistency losses and extend it to a feed-forward framework (MVAST-FF) to overcome the chronic computational inefficiency issue of optimization-based algorithms. 
We validate our methods on the DTU dataset, a large-scale multi-view stereo dataset, and confirmed the superiority on preserving appearance consistency throughout the stylized multi-view images.", + "title": "Multi-view Arbitrary Style Transfer", + "authors": [ + "Taekyung Kim", + "Changick Kim" + ], + "emails": [ + "~Changick_Kim1", + "~Taekyung_Kim3" + ], + "rank": 2396 + }, + { + "url": "https://openreview.net/forum?id=4q8qGBf4Zxb", + "decision": "Reject", + "ratings": [ + 6, + 4, + 4, + 4 + ], + "rating": "4.35", + "confidences": [ + 3, + 5, + 4, + 5 + ], + "abstract": "Deep networks have been used to learn transferable representations for domain adaptation. Existing deep domain adaptation methods systematically employ popular hand-crafted networks designed specifically for image-classification tasks, leading to sub-optimal domain adaptation performance. In this paper, we present Neural Architecture Search for Domain Adaptation (NASDA), a principle framework that leverages differentiable neural architecture search to derive the optimal network architecture for domain adaptation task. NASDA is designed with two novel training strategies: neural architecture search with multi-kernel Maximum Mean Discrepancy to derive the optimal architecture, and adversarial training between a feature generator and a batch of classifiers to consolidate the feature generator. 
We demonstrate experimentally that NASDA leads to state-of-the-art performance on several domain adaptation benchmarks.", + "title": "Network Architecture Search for Domain Adaptation", + "authors": [ + "Yichen Li", + "Xingchao Peng" + ], + "emails": [ + "~Xingchao_Peng1", + "~Yichen_Li2" + ], + "rank": 2397 + }, + { + "url": "https://openreview.net/forum?id=98ntbCuqf4i", + "decision": "Reject", + "ratings": [ + 4, + 6, + 5, + 3, + 4 + ], + "rating": "4.35", + "confidences": [ + 3, + 3, + 4, + 4, + 3 + ], + "abstract": "The principle of optimism in the face of (aleatoric and epistemic) uncertainty has been utilized to design efficient exploration strategies for Reinforcement Learning (RL). Different from most prior work targeting at discrete action space, we propose a generally information-theoretic exploration principle called Max-Q Entropy Search (MQES) for continuous RL algorithms.\nMQES formulates the exploration policy to maximize the information about the globally optimal distribution of Q function, which could explore optimistically and avoid over-exploration by recognizing the epistemic and aleatoric uncertainty, respectively. To make MQES practically tractable, we firstly incorporate distributional and ensemble Q function approximations to MQES, which could formulate the epistemic and aleatoric uncertainty accordingly. Then, we introduce a constraint to stabilize the training and solve the constrained MQES problem to derive the exploration policy in closed form. 
Empirical evaluations show that MQES outperforms state-of-the-art algorithms on Mujoco environments.", + "title": "MQES: Max-Q Entropy Search for Efficient Exploration in Continuous Reinforcement Learning", + "authors": [ + "Jinyi Liu", + "Zhi Wang", + "Jianye HAO", + "YAN ZHENG" + ], + "emails": [ + "~Jianye_HAO1", + "~YAN_ZHENG1", + "~Zhi_Wang4", + "~Jinyi_Liu1" + ], + "rank": 2398 + }, + { + "url": "https://openreview.net/forum?id=pQq3oLH9UmL", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5, + 4 + ], + "rating": "4.33", + "confidences": [ + 3, + 1, + 4, + 4 + ], + "abstract": "A visual hard attention model actively selects and observes a sequence of subregions in an image to make a prediction. Unlike in the deep convolution network, in hard attention it is explainable which regions of the image contributed to the prediction. However, the attention policy used by the model to select these regions is not explainable. The majority of hard attention models determine the attention-worthy regions by first analyzing a complete image. However, it may be the case that the entire image is not available in the beginning but instead sensed gradually through a series of partial observations. In this paper, we design an efficient hard attention model for classifying partially observable scenes. The attention policy used by our model is explainable and non-parametric. The model estimates expected information gain (EIG) obtained from attending various regions by predicting their content ahead of time. It compares EIG using Bayesian Optimal Experiment Design and attends to the region with maximum EIG. We train our model with a differentiable objective, optimized using gradient descent, and test it on several datasets. The performance of our model is comparable to or better than the baseline models.", + "title": "Achieving Explainability in a Visual Hard Attention Model through Content Prediction", + "authors": [ + "Samrudhdhi Bharatkumar Rangrej", + "James J. 
Clark" + ], + "emails": [ + "~Samrudhdhi_Bharatkumar_Rangrej1", + "~James_J._Clark1" + ], + "rank": 2399 + }, + { + "url": "https://openreview.net/forum?id=RepN5K31PT3", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4 + ], + "rating": "4.33", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "We study the problem of online convex optimization, where a learner makes sequential decisions to minimize an accumulation of strongly convex costs over time. The quality of decisions is given in terms of the dynamic regret, which measures the performance of the learner relative to a sequence of dynamic minimizers. Prior works on gradient descent and mirror descent have shown that the dynamic regret can be upper bounded using the path length, which depend on the differences between successive minimizers, and an upper bound using the squared path length has also been shown when multiple gradient queries are allowed per round. However, they all require the cost functions to be Lipschitz continuous, which imposes a strong requirement especially when the cost functions are also strongly convex. In this work, we consider Online Multiple Mirror Descent (OMMD), which is based on mirror descent but uses multiple mirror descent steps per online round. Without requiring the cost functions to be Lipschitz continuous, we derive two upper bounds on the dynamic regret based on the path length and squared path length. We further derive a third upper bound that relies on the gradient of cost functions, which can be much smaller than the path length or squared path length, especially when the cost functions are smooth but fluctuate over time. Thus, we show that the dynamic regret of OMMD scales linearly with the minimum among the path length, squared path length, and sum squared gradients. 
Our experimental results further show substantial improvement on the dynamic regret compared with existing alternatives.", + "title": "On the Dynamic Regret of Online Multiple Mirror Descent", + "authors": [ + "Nima Eshraghi", + "and Ben Liang" + ], + "emails": [ + "~Nima_Eshraghi1", + "~and_Ben_Liang1" + ], + "rank": 2400 + }, + { + "url": "https://openreview.net/forum?id=LnVNgfvrQjC", + "decision": "Reject", + "ratings": [ + 4, + 4, + 6, + 4 + ], + "rating": "4.33", + "confidences": [ + 5, + 5, + 3, + 5 + ], + "abstract": "We tackle a novel few-shot learning challenge, few-shot semantic edge detection, aiming to localize boundaries of novel categories using only a few labeled samples. Reliable boundary information has been shown to boost the performance of semantic segmentation and localization, while also playing a key role in its own right in object reconstruction, image generation and medical imaging. Few-shot semantic edge detection allows recovery of accurate boundaries with just a few examples. In this work, we present a Class-Agnostic Few-shot Edge detection Network (CAFENet) based on meta-learning strategy. CAFENet employs a semantic segmentation module in small-scale to compensate for lack of semantic information in edge labels. The predicted segmentation mask is used to generate an attention map to highlight the target object region, and make the decoder module concentrate on that region. We also propose a new regularization method based on multi-split matching. In meta-training, the metric-learning problem with high-dimensional vectors are divided into smaller subproblems with low-dimensional sub-vectors. Since there are no existing datasets for few-shot semantic edge detection, we construct two new datasets, FSE-1000 and SBD-5i, and evaluate the performance of the proposed CAFENet on them. 
Extensive simulation results confirm that the proposed CAFENet achieves better performance compared to the baseline methods using fine-tuning or few-shot segmentation.", + "title": "CAFENet: Class-Agnostic Few-Shot Edge Detection Network", + "authors": [ + "Younghyun Park", + "Jun Seo", + "Jaekyun Moon" + ], + "emails": [ + "~Younghyun_Park1", + "~Jun_Seo1", + "~Jaekyun_Moon2" + ], + "rank": 2401 + }, + { + "url": "https://openreview.net/forum?id=p8EpbhCiSsF", + "ratings": [ + 3, + 5, + 3, + 7 + ], + "rating": "4.33", + "confidences": [ + 4, + 2, + 5, + 4 + ], + "abstract": "Generative Adversarial Networks (GANs) have achieved large attention and great success in various research areas, but it still suffers from training instability. Recently hinge adversarial loss for GAN is proposed that incorporates the SVM margins where real and fake samples falling within the margins contribute to the loss calculation. In a generator training step, however, fake samples outside of the margins that partially include unrealistic local patterns are ignored. In this work, we propose local gradient amplifier(LGA) which realizes spatially decomposed hinge adversarial loss for improved generator training. Spatially decomposed hinge adversarial loss applies different margins for different spatial regions extending overall margin space toward all fake samples asymmetrically. 
Our proposed method is evaluated on several public benchmark data sets compared to state of the art methods showing outstanding stability in training GANs.", + "title": "Spatially Decomposed Hinge Adversarial Loss by Local Gradient Amplifier", + "authors": [ + "Sanghun Kim", + "Seungkyu Lee" + ], + "emails": [ + "~Seungkyu_Lee1", + "khu.ac.kr" + ], + "rank": 2402 + }, + { + "url": "https://openreview.net/forum?id=7dmdzJz42Ro", + "ratings": [ + 5, + 5, + 3, + 4 + ], + "rating": "4.33", + "confidences": [ + 5, + 3, + 3, + 4 + ], + "abstract": "Contrastive approaches to representation learning have recently shown great promise. In contrast to generative approaches, these contrastive models learn a deterministic encoder with no notion of uncertainty or confidence. In this paper, we introduce a simple approach based on \"contrasting distributions\" that learns to assign uncertainty for pretrained contrastive representations. In particular, we train a deep network from a representation to a distribution in representation space, whose variance can be used as a measure of confidence. 
In our experiments, we show that this deep uncertainty model can be used (1) to visually interpret model behavior, (2) to detect new noise in the input to deployed models, (3) to detect anomalies, where we outperform 10 baseline methods across 11 tasks with improvements of up to 14% absolute, and (4) to classify out-of-distribution examples where our fully unsupervised model is competitive with supervised methods.", + "title": "A Simple Framework for Uncertainty in Contrastive Learning", + "authors": [ + "Mike Wu", + "Noah Goodman" + ], + "emails": [ + "~Noah_Goodman1", + "~Mike_Wu1" + ], + "rank": 2403 + }, + { + "url": "https://openreview.net/forum?id=auAe8kqRe7f", + "ratings": [ + 4, + 4, + 5 + ], + "rating": "4.33", + "confidences": [ + 5, + 5, + 5 + ], + "abstract": "Adversarial training is one of the most effective methods for defending adversarial attacks, but it is computationally costly. In this paper, we propose Saliency Adversarial Defense (SAD), an efficient defense algorithm that avoids adversarial training. The saliency map is added to the input with a hybridization ratio to enhance those pixels that are important for making decisions. This process causes a distribution shift to the original data. Interestingly, we find that this shift can be effectively fixed by updating the statistics of batch normalization with the processed data without further training. We justify the algorithm with a linear model that the added saliency maps pull data away from its closest decision boundary. Updating BN effectively evolves the decision boundary to fit the new data. As a result, the distance between the decision boundary and the original inputs are increased such that the model is able to defend stronger attacks and thus improve robustness. Then we show in experiments that the results still hold for complex models and datasets. Our results demonstrate that SAD is superior in defending various attacks, including both white-box and black-box ones. 
", + "title": "SAD: Saliency Adversarial Defense without Adversarial Training", + "authors": [ + "Yao Zhu", + "Jiacheng Sun", + "Zewei Chen", + "Zhenguo Li" + ], + "emails": [ + "~Yao_Zhu2", + "~Zewei_Chen1", + "~Jiacheng_Sun1", + "~Zhenguo_Li1" + ], + "rank": 2404 + }, + { + "url": "https://openreview.net/forum?id=cmcwUBKeoUH", + "decision": "Reject", + "ratings": [ + 4, + 6, + 3 + ], + "rating": "4.33", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Monitoring blood oxygen is critical in a variety of medical conditions. For almost a century, pulse oximetry has been the only non-invasive method for measuring blood oxygen. While highly useful, pulse oximetry has important limitations. It requires wearable sensors, which can be cumbersome for older patients. It is also known to be biased when used for dark-skinned subjects. In this paper, we demonstrate, for the first time, the feasibility of predicting oxygen saturation from breathing. By eliminating the dependency on oximetry, we eliminate bias against skin color. Further, since breathing can be monitored without body contact by analyzing the radio signal in the environment, we show that oxygen too can be monitored without any wearable devices. We introduce a new approach for leveraging auxiliary variables via a switcher-based multi-headed neural network model. 
Empirical results show that our model achieves good accuracy on multiple medical datasets.", + "title": "Learning Blood Oxygen from Respiration Signals", + "authors": [ + "Hao He", + "Ying-Cong Chen", + "Yuan Yuan", + "Dina Katabi" + ], + "emails": [ + "~Hao_He1", + "~Dina_Katabi1", + "~Ying-Cong_Chen1", + "~Yuan_Yuan5" + ], + "rank": 2405 + }, + { + "url": "https://openreview.net/forum?id=n5go16HF_B", + "decision": "Reject", + "ratings": [ + 5, + 5, + 3 + ], + "rating": "4.33", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Asynchronous stochastic discrete event based processes are commonplace in application domains such as social science, homeland security, and health informatics. Modeling complex interactions of such event data via marked temporal point processes (MTPPs) provides the ability of detection and prediction of specific interests or profiles. We present a novel multi-category MTPP generation technique for applications where training datasets are inherently sparse, incomplete, and small. The proposed adversarial architecture augments adversarial autoencoder (AAE) with feature mapping techniques, which includes a transformation between the categories and timestamps of marked points and the percentile distribution of the particular category. The transformation of training data to the distribution facilitates the accurate capture of underlying process characteristics despite the sparseness and incompleteness of data. The proposed method is validated using several benchmark datasets. The similarity between actual and generated MTPPs is evaluated and compared with a Markov process based baseline. 
Results demonstrate the effectiveness and robustness of the proposed technique.", + "title": "Adversarial Data Generation of Multi-category Marked Temporal Point Processes with Sparse, Incomplete, and Small Training Samples", + "authors": [ + "Shashika Ranga Muramudalige", + "Anura Jayasumana", + "Haonan Wang" + ], + "emails": [ + "~Haonan_Wang2", + "~Shashika_Ranga_Muramudalige1", + "~Anura_Jayasumana1" + ], + "rank": 2406 + }, + { + "url": "https://openreview.net/forum?id=M6PP1Gq076C", + "ratings": [ + 5, + 3, + 5, + 5 + ], + "rating": "4.33", + "confidences": [ + 4, + 5, + 3, + 3 + ], + "abstract": "Bootstrapping has been a primary tool for uncertainty quantification, and their theoretical and computational properties have been investigated in the field of statistics and machine learning. However, due to its nature of repetitive computations, the computational burden required to implement bootstrap procedures for the neural network is painfully heavy, and this fact seriously hurdles the practical use of these procedures on the uncertainty estimation of modern deep learning. To overcome the inconvenience, we propose a procedure called \\emph{Neural Bootstrapper} (NeuBoots). We reveal that the NeuBoots stably generate valid bootstrap samples that coincide with the desired target samples with minimal extra computational cost compared to traditional bootstrapping. \nConsequently, NeuBoots makes it feasible to construct bootstrap confidence intervals of outputs of neural networks and quantify their predictive uncertainty. We also suggest NeuBoots for deep convolutional neural networks to consider its utility in image classification tasks, including calibration, detection of out-of-distribution samples, and active learning. Empirical results demonstrate that NeuBoots is significantly beneficial for the above purposes. 
", + "title": "Neural Bootstrapper", + "authors": [ + "Minsuk Shin", + "Hyungjoo Cho", + "Sungbin Lim" + ], + "emails": [ + "~Sungbin_Lim1", + "mailbox.sc.edu", + "~Hyungjoo_Cho1" + ], + "rank": 2407 + }, + { + "url": "https://openreview.net/forum?id=0_ao8yS2eBw", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4 + ], + "rating": "4.33", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "There have been increasing challenges to solve combinatorial optimization problems by machine learning. \nKhalil et al. (NeurIPS 2017) proposed an end-to-end reinforcement learning framework, which automatically learns graph embeddings to construct solutions to a wide range of problems.\nHowever, it sometimes performs poorly on graphs having different characteristics than training graphs.\nTo improve its generalization ability to various graphs, we propose a novel learning strategy based on AlphaGo Zero, a Go engine that achieved a superhuman level without the domain knowledge of the game.\nWe redesign AlphaGo Zero for combinatorial optimization problems, taking into account several differences from two-player games.\nIn experiments on five NP-hard problems such as {\\sc MinimumVertexCover} and {\\sc MaxCut}, our method, with only a policy network, shows better generalization than the previous method to various instances that are not used for training, including random graphs, synthetic graphs, and real-world graphs.\nFurthermore, our method is significantly enhanced by a test-time Monte Carlo Tree Search which makes full use of the policy network and value network.\nWe also compare recently-developed graph neural network (GNN) models, with an interesting insight into a suitable choice of GNN models for each task.", + "title": "Solving NP-Hard Problems on Graphs with Extended AlphaGo Zero", + "authors": [ + "Kenshin Abe", + "Zijian Xu", + "Issei Sato", + "Masashi Sugiyama" + ], + "emails": [ + "~Issei_Sato1", + "~Kenshin_Abe1", + "~Zijian_Xu1", + "~Masashi_Sugiyama1" + ], + 
"rank": 2408 + }, + { + "url": "https://openreview.net/forum?id=K4wkUp5xNK", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5 + ], + "rating": "4.33", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Due to spurious correlations, machine learning systems often fail to generalize to environments whose distributions differ from the ones used at training time. Prior work addressing this, either explicitly or implicitly, attempted to find a data representation that has an invariant causal relationship with the outcome. This is done by leveraging a diverse set of training environments to reduce the effect of spurious features, on top of which an invariant classifier is then built. However, these methods have generalization guarantees only when both data representation and classifiers come from a linear model class. As an alternative, we propose Invariant Causal Representation Learning (ICRL), a learning paradigm that enables out-of-distribution generalization in the nonlinear setting (i.e., nonlinear representations and nonlinear classifiers). It builds upon a practical and general assumption: data representations factorize when conditioning on the outcome and the environment. Based on this, we show identifiability up to a permutation and pointwise transformation. We also prove that all direct causes of the outcome can be fully discovered, which further enables us to obtain generalization guarantees in the nonlinear setting. 
Extensive experiments on both synthetic and real-world datasets show that our approach significantly outperforms a variety of baseline methods.", + "title": "Invariant Causal Representation Learning", + "authors": [ + "Chaochao Lu", + "Yuhuai Wu", + "Jos\u00e9 Miguel Hern\u00e1ndez-Lobato", + "Bernhard Sch\u00f6lkopf" + ], + "emails": [ + "~Jos\u00e9_Miguel_Hern\u00e1ndez-Lobato1", + "~Chaochao_Lu1", + "~Bernhard_Sch\u00f6lkopf1", + "~Yuhuai_Wu1" + ], + "rank": 2409 + }, + { + "url": "https://openreview.net/forum?id=lWaz5a9lcFU", + "decision": "Accept (Poster)", + "ratings": [ + 4, + 6, + 4 + ], + "rating": "4.33", + "confidences": [ + 5, + 2, + 5 + ], + "abstract": "The two main impediments to continual learning are catastrophic forgetting and memory limitations on the storage of data. To cope with these challenges, we propose a novel, cognitively-inspired approach which trains autoencoders with Neural Style Transfer to encode and store images. Reconstructed images from encoded episodes are replayed when training the classifier model on a new task to avoid catastrophic forgetting. The loss function for the reconstructed images is weighted to reduce its effect during classifier training to cope with image degradation. When the system runs out of memory the encoded episodes are converted into centroids and covariance matrices, which are used to generate pseudo-images during classifier training, keeping classifier performance stable with less memory. 
Our approach increases classification accuracy by 13-17% over state-of-the-art methods on benchmark datasets, while requiring 78% less storage space.", + "title": "EEC: Learning to Encode and Regenerate Images for Continual Learning", + "authors": [ + "Ali Ayub", + "Alan Wagner" + ], + "emails": [ + "~Alan_Wagner2", + "~Ali_Ayub1" + ], + "rank": 2410 + }, + { + "url": "https://openreview.net/forum?id=dcktlmtcM7", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5, + 3 + ], + "rating": "4.33", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "Partial differential equations (PDEs) play a crucial role in studying a vast number of problems in science and engineering. Numerically solving nonlinear and/or high-dimensional PDEs is frequently a challenging task. Inspired by the traditional finite difference and finite elements methods and emerging advancements in machine learning, we propose a sequence-to-sequence learning (Seq2Seq) framework called Neural-PDE, which allows one to automatically learn governing rules of any time-dependent PDE system from existing data by using a bidirectional LSTM encoder, and predict the solutions in next n time steps. One critical feature of our proposed framework is that the Neural-PDE is able to simultaneously learn and simulate all variables of interest in a PDE system. We test the Neural-PDE by a range of examples, from one-dimensional PDEs to a multi-dimensional and nonlinear complex fluids model. The results show that the Neural-PDE is capable of learning the initial conditions, boundary conditions and differential operators defining the initial-boundary-value problem of a PDE system without the knowledge of the specific form of the PDE system. In our experiments, the Neural-PDE can efficiently extract the dynamics within 20 epochs training and produce accurate predictions. 
Furthermore, unlike the traditional machine learning approaches for learning PDEs, such as CNN and MLP, which require great quantity of parameters for model precision, the Neural-PDE shares parameters among all time steps, and thus considerably reduces computational complexity and leads to a fast learning algorithm. ", + "title": "Neural Time-Dependent Partial Differential Equation", + "authors": [ + "Yihao Hu", + "Tong Zhao", + "Zhiliang Xu", + "Lizhen Lin" + ], + "emails": [ + "~Zhiliang_Xu1", + "~Lizhen_Lin1", + "~Tong_Zhao3", + "~Yihao_Hu1" + ], + "rank": 2411 + }, + { + "url": "https://openreview.net/forum?id=0aZG2VcWLY", + "decision": "Reject", + "ratings": [ + 3, + 5, + 7, + 3 + ], + "rating": "4.33", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "In many animal sensory pathways, the transformation from external stimuli to spike trains is essentially deterministic. In this context, a new mathematical framework for coding and reconstruction, based on a biologically plausible model of the spiking neuron, is presented. The framework considers encoding of a signal through spike trains generated by an ensemble of neurons via a standard convolve-then-threshold mechanism, albeit with a wide variety of convolution kernels. Neurons are distinguished by their convolution kernels and threshold values. Reconstruction is posited as a convex optimization minimizing energy. Formal conditions under which perfect reconstruction of the signal from the spike trains is possible are then identified. 
Coding experiments on a large audio dataset are presented to demonstrate the strength of the framework.", + "title": "Signal Coding and Reconstruction using Spike Trains", + "authors": [ + "Anik Chattopadhyay", + "Arunava Banerjee" + ], + "emails": [ + "~Arunava_Banerjee2", + "~Anik_Chattopadhyay1" + ], + "rank": 2412 + }, + { + "url": "https://openreview.net/forum?id=nzLFm097HI", + "decision": "Reject", + "ratings": [ + 5, + 5, + 3 + ], + "rating": "4.33", + "confidences": [ + 5, + 5, + 5 + ], + "abstract": "In multi-modal reasoning tasks, such as visual question answering (VQA), there have been many modeling and training paradigms tested. Previous models propose different methods for the vision and language tasks, but which ones perform the best while being sample and computationally efficient? Based on our experiments, we find that representing the text as probabilistic programs and images as object-level scene graphs best satisfy these desiderata. We extend existing models to leverage these soft programs and scene graphs to train on question answer pairs in an end-to-end manner. Empirical results demonstrate that this differentiable end-to-end program executor is able to maintain state-of-the-art accuracy while being sample and computationally efficient.", + "title": "How to Design Sample and Computationally Efficient VQA Models", + "authors": [ + "Karan Samel", + "Zelin Zhao", + "Kuan Wang", + "Robin Luo", + "Binghong Chen", + "Le Song" + ], + "emails": [ + "~Karan_Samel2", + "gatech.edu", + "~Binghong_Chen1", + "~Le_Song1", + "~Zelin_Zhao1", + "~Kuan_Wang1" + ], + "rank": 2413 + }, + { + "url": "https://openreview.net/forum?id=JthLaV0RsV", + "ratings": [ + 4, + 6, + 3 + ], + "rating": "4.33", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Despite the recent success of large-scale language models on various downstream NLP tasks, the repetition and inconsistency problems still persist in dialogue response generation. 
Previous approaches have attempted to avoid repetition by penalizing the language model's undesirable behaviors in the loss function. However, these methods focus on token-level information and can lead to incoherent responses and uninterpretable behaviors. To alleviate these issues, we propose to apply reinforcement learning to refine an MLE-based language model without user simulators, and distill sentence-level information about repetition, inconsistency and task relevance through rewards. In addition, to better accomplish the dialogue task, the model learns from human demonstration to imitate intellectual activities such as persuasion, and selects the most persuasive responses. Experiments show that our model outperforms previous state-of-the-art dialogue models on both automatic metrics and human evaluation results on a donation persuasion task, and generates more diverse, consistent and persuasive conversations according to the user feedback. We will release the code and data upon acceptance.", + "title": "Refine and Imitate: Reducing Repetition and Inconsistency in Dialogue Generation via Reinforcement Learning and Human Demonstration", + "authors": [ + "Weiyan Shi", + "Yu Li", + "Saurav Sahay", + "Zhou Yu" + ], + "emails": [ + "~Yu_Li6", + "~Zhou_Yu1", + "~Saurav_Sahay1", + "~Weiyan_Shi2" + ], + "rank": 2414 + }, + { + "url": "https://openreview.net/forum?id=o7YTArVXdEW", + "decision": "Reject", + "ratings": [ + 5, + 3, + 5 + ], + "rating": "4.33", + "confidences": [ + 3, + 3, + 3 + ], + "abstract": "Unsupervised representation learning is essential in the field of machine learning, and accurate neighbor clusters of representation show great potential to support unsupervised image classification. This paper proposes a VAE (Variational Autoencoder) based network and a clustering method to achieve adaptive neighbor clustering to support the self-supervised classification. 
The proposed network encodes the image into the representation with boundary information, and the proposed cluster method takes advantage of the boundary information to deliver adaptive neighbor cluster results. Experimental evaluations show that the proposed method outperforms state-of-the-art representation learning methods in terms of neighbor clustering accuracy. Particularly, AC-VAE achieves 95\\% and 82\\% accuracy on CIFAR10 dataset when the average neighbor cluster sizes are 10 and 100. Furthermore, the neighbor cluster results are found converge within the clustering range (\u03b1\u22642), and the converged neighbor clusters are used to support the self-supervised classification. The proposed method delivers classification results that are competitive with the state-of-the-art and reduces the super parameter k in KNN (K-nearest neighbor), which is often used in self-supervised classification.", + "title": "AC-VAE: Learning Semantic Representation with VAE for Adaptive Clustering", + "authors": [ + "Xingyu Xie", + "Minjuan Zhu", + "Yan Wang", + "Lei Zhang" + ], + "emails": [ + "~Xingyu_Xie2", + "~Yan_Wang13", + "~Lei_Zhang25", + "~Minjuan_Zhu1" + ], + "rank": 2415 + }, + { + "url": "https://openreview.net/forum?id=_77KiX2VIEg", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5, + 3 + ], + "rating": "4.33", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "Deep neural networks represent the gold standard for image classification.\nHowever, they usually need large amounts of data to reach superior performance.\nIn this work, we focus on image classification problems with a few labeled examples per class and improve sample efficiency in the low data regime by using an ensemble of relatively small deep networks. 
\nFor the first time, our work broadly studies the existing concept of neural ensembling in small data domains, through an extensive validation using popular datasets and architectures.\nWe show that deep ensembling is a simple yet effective technique that outperforms current state-of-the-art approaches for learning from small datasets.\nWe compare different ensemble configurations to their deeper and wider competitors given a total fixed computational budget and provide empirical evidence of their advantage.\nFurthermore, we investigate the effectiveness of different losses and show that their choice should be made considering different factors.", + "title": "On the Effectiveness of Deep Ensembles for Small Data Tasks", + "authors": [ + "Lorenzo Brigato", + "Luca Iocchi" + ], + "emails": [ + "~Lorenzo_Brigato1", + "~Luca_Iocchi1" + ], + "rank": 2416 + }, + { + "url": "https://openreview.net/forum?id=JUc6-1xuOX", + "ratings": [ + 4, + 3, + 5, + 5 + ], + "rating": "4.33", + "confidences": [ + 4, + 3, + 5, + 3 + ], + "abstract": "We introduce a deep latent recommender system named deepLTRS in order to provide users with high quality recommendations based on observed user ratings and texts of product reviews. The underlying motivation is that, when a user scores only a few products, the texts used in the reviews represent a significant source of information. The addition of review information can alleviate data sparsity, thereby enhancing the predictive ability of the model. Our approach adopts a variational auto-encoder architecture as a generative deep latent variable model for both an ordinal matrix encoding users scores about products, and a document-term matrix encoding the reviews. Moreover, different from unique user-based or item-based models, deepLTRS assumes latent representations for both users and products. An alternated user/product mini-batching optimization structure is proposed to jointly capture user and product preferences. 
Numerical experiments on simulated and real-world data sets demonstrate that deepLTRS outperforms the state-of-the-art, in particular in contexts of extreme data sparsity.", + "title": "DeepLTRS: A Deep Latent Recommender System based on User Ratings and Reviews", + "authors": [ + "Dingge LIANG", + "Marco Corneli", + "pierre Latouche", + "Charles Bouveyron" + ], + "emails": [ + "~Dingge_LIANG1", + "~Marco_Corneli1", + "~Charles_Bouveyron2", + "parisdescartes.fr" + ], + "rank": 2417 + }, + { + "url": "https://openreview.net/forum?id=l_LGi6xeNT9", + "decision": "Reject", + "ratings": [ + 6, + 3, + 3, + 5 + ], + "rating": "4.33", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Current deep learning architectures that make use of the 3D convolution (3DConv) achieve state-of-the-art results on action recognition benchmarks. However, the 3DConv does not easily lend itself to explainable model decisions. To this end we introduce a novel and intrinsic approach, whereby all the aspects of the 3DConv are rendered explainable. Our approach proposes the temporally factorized 3D convolution (3TConv) as an interpretable alternative to the regular 3DConv. In a 3TConv the 3D convolutional filter is obtained by learning a 2D filter and a set of temporal transformation parameters, resulting in a sparse filter requiring less parameters. We demonstrate that 3TConv learns temporal transformations that afford a direct interpretation by analyzing the transformation parameter statistics on a model level. 
Our experiments show that in the low-data regime the 3TConv outperforms 3DConv and R(2+1)D while containing up to 77\\% less parameters.", + "title": "The 3TConv: An Intrinsic Approach to Explainable 3D CNNs", + "authors": [ + "Gabrielle Ras", + "Luca Ambrogioni", + "Pim Haselager", + "Marcel van Gerven", + "Umut G\u00fc\u00e7l\u00fc" + ], + "emails": [ + "donders.ru.nl", + "~Umut_G\u00fc\u00e7l\u00fc1", + "~Luca_Ambrogioni1", + "~Marcel_van_Gerven1", + "~Gabrielle_Ras1" + ], + "rank": 2418 + }, + { + "url": "https://openreview.net/forum?id=160xFQdp7HR", + "decision": "Reject", + "ratings": [ + 8, + 5, + 4, + 3 + ], + "rating": "4.33", + "confidences": [ + 1, + 4, + 3, + 4 + ], + "abstract": "We propose an artificial life framework aimed at facilitating the emergence of intelligent organisms. In this framework there is no explicit notion of an agent: instead there is an environment made of atomic elements. These elements contain neural operations and interact through exchanges of information and through physics-like rules contained in the environment. We discuss how an evolutionary process can lead to the emergence of different organisms made of many such atomic elements which can coexist and thrive in the environment. We discuss how this forms the basis of a general AI generating algorithm. We provide a simplified implementation of such system and discuss what advances need to be made to scale it up further.", + "title": "Self-Organizing Intelligent Matter: A blueprint for an AI generating algorithm", + "authors": [ + "Karol Gregor", + "Frederic Besse" + ], + "emails": [ + "~Karol_Gregor1", + "~Frederic_Besse1" + ], + "rank": 2419 + }, + { + "url": "https://openreview.net/forum?id=Z_3x5eFk1l-", + "decision": "Reject", + "ratings": [ + 3, + 4, + 4, + 6, + 5 + ], + "rating": "4.32", + "confidences": [ + 5, + 5, + 4, + 4, + 4 + ], + "abstract": "Meta-learning enables a model to learn from very limited data to undertake a new task. 
In this paper, we study the general meta-learning with adversarial samples. We present a meta-learning algorithm, ADML (ADversarial Meta-Learner), which leverages clean and adversarial samples to optimize the initialization of a learning model in an adversarial manner. ADML leads to the following desirable properties: 1) it turns out to be very effective even in the cases with only clean samples; 2) it is robust to adversarial samples, i.e., unlike other meta-learning algorithms, it only leads to a minor performance degradation when there are adversarial samples; 3) it sheds light on tackling the cases with limited and even contaminated samples. It has been shown by extensive experimental results that ADML outperforms several representative meta-learning algorithms in the cases involving adversarial samples generated by different attack mechanisms, on two widely-used image datasets, MiniImageNet and CIFAR100, in terms of both accuracy and robustness.", + "title": "Adversarial Meta-Learning", + "authors": [ + "Chengxiang Yin", + "Jian Tang", + "Zhiyuan Xu", + "Yanzhi Wang" + ], + "emails": [ + "~Zhiyuan_Xu1", + "~Jian_Tang5", + "~Yanzhi_Wang2", + "~Chengxiang_Yin1" + ], + "rank": 2420 + }, + { + "url": "https://openreview.net/forum?id=Al7Wpsy49g", + "decision": "Reject", + "ratings": [ + 2, + 5, + 4, + 6, + 5, + 4 + ], + "rating": "4.32", + "confidences": [ + 4, + 4, + 3, + 4, + 3, + 4 + ], + "abstract": "Humans integrate multiple sensory modalities (e.g., visual and audio) to build a causal understanding of the physical world. In this work, we propose a novel type of intrinsic motivation for Reinforcement Learning (RL) that encourages the agent to understand the causal effect of its actions through auditory event prediction. First, we allow the agent to collect a small amount of acoustic data and use K-means to discover underlying auditory event clusters. 
We then train a neural network to predict the auditory events and use the prediction errors as intrinsic rewards to guide RL exploration. We first conduct an in-depth analysis of our module using a set of Atari games. We then apply our model to audio-visual exploration using the Habitat simulator and active learning using the TDW simulator. Experimental results demonstrate the advantages of using audio signals over vision-based models as intrinsic rewards to guide RL explorations.", + "title": "Noisy Agents: Self-supervised Exploration by Predicting Auditory Events", + "authors": [ + "Chuang Gan", + "Xiaoyu Chen", + "Phillip Isola", + "Antonio Torralba", + "Joshua B. Tenenbaum" + ], + "emails": [ + "~Antonio_Torralba1", + "~Phillip_Isola1", + "~Xiaoyu_Chen4", + "~Joshua_B._Tenenbaum1", + "~Chuang_Gan1" + ], + "rank": 2421 + }, + { + "url": "https://openreview.net/forum?id=igkmo23BgzB", + "decision": "Reject", + "ratings": [ + 3, + 6, + 5, + 3 + ], + "rating": "4.32", + "confidences": [ + 4, + 5, + 5, + 5 + ], + "abstract": "Quantization of neural network parameters and activations has emerged as a successful approach to reducing the model size and inference time on hardware that sup-ports native low-precision arithmetic. Fully quantized training would facilitate further computational speed-ups as well as enable model training on embedded devices, a feature that would alleviate privacy concerns resulting from the transfer of sensitive data and models that is necessitated by off-device training. Existing approaches to quantization-aware training (QAT) perform \u201cfake\u201d quantization in the forward pass in order to learn model parameters that will perform well when quantized, but rely on higher precision variables to avoid overflow in large matrix multiplications, which is unsuitable for training on fully low-precision (e.g. 8-bit)hardware. To enable fully end-to-end quantized training, we propose Log BarrierTail-bounded Quantization (LogBTQ). 
LogBTQ introduces a loss term, inspired by the log-barrier for constrained optimization, that enforces soft constraints on the range of values that model parameters can take on. By constraining and sparsifying model parameters, activations and inputs, our approach eliminates over-flow in practice, allowing for fully quantized 8-bit training of deep neural network models. We show that models trained using our approach achieve results competitive with state-of-the-art full-precision networks on the MNIST, CIFAR-10 andImageNet classification benchmarks.", + "title": "End-to-end Quantized Training via Log-Barrier Extensions", + "authors": [ + "Juncheng B Li", + "Shuhui Qu", + "Xinjian Li", + "Emma Strubell", + "Florian Metze" + ], + "emails": [ + "~Juncheng_B_Li1", + "~Shuhui_Qu1", + "~Emma_Strubell1", + "~Florian_Metze1", + "~Xinjian_Li2" + ], + "rank": 2422 + }, + { + "url": "https://openreview.net/forum?id=yM5rtGA4pNX", + "ratings": [ + 5, + 6, + 3, + 3 + ], + "rating": "4.32", + "confidences": [ + 5, + 5, + 4, + 5 + ], + "abstract": "High-fidelity singing voices usually require higher sampling rate (e.g., 48kHz, compared with 16kHz or 24kHz in speaking voices) with large range of frequency to convey rich expression and emotion. However, higher sampling rate results in wider frequency band and longer waveform sequence with more fine-grained details and presents challenges for singing modeling in both frequency and time domains in singing voice synthesis (SVS). In this paper, we develop HiFiSinger, an SVS system towards high-fidelity singing voice using 48kHz sampling rate. HiFiSinger consists of a FastSpeech based neural acoustic model and a Parallel WaveGAN based neural vocoder to ensure fast training and inference and also high voice quality. 
To tackle the difficulty of singing modeling caused by high sampling rate (wider frequency band and longer waveform), we introduce multi-scale adversarial training in both the acoustic model and vocoder to improve singing modeling. Specifically, 1) To handle the larger range of frequencies caused by higher sampling rate (e.g., 48kHz vs. 24kHz), we introduce a novel sub-frequency GAN (SF-GAN) on mel-spectrogram generation, which splits the full 80-dimensional mel-frequency into multiple sub-bands (e.g. low, middle and high frequency bands) and models each sub-band with a separate discriminator. 2) To model longer waveform sequences caused by higher sampling rate, we introduce a multi-length GAN (ML-GAN) for waveform generation to model different lengths of waveform sequences with separate discriminators. 3) We also introduce several additional designs in HiFiSinger that are crucial for high-fidelity voices, such as adding F0 (pitch) and V/UV (voiced/unvoiced flag) as acoustic features, choosing an appropriate window and hop size for mel-spectrogram, and increasing the receptive field in vocoder for long vowel modeling in singing voices. Experiment results show that HiFiSinger synthesizes high-fidelity singing voices with much higher quality: 0.32/0.44 MOS gain over 48kHz/24kHz baseline and 0.83 MOS gain over previous SVS systems. 
Audio samples are available at https://hifisinger.github.io.", + "title": "HiFiSinger: Towards High-Fidelity Neural Singing Voice Synthesis", + "authors": [ + "Jiawei Chen", + "Xu Tan", + "Jian luan", + "Tao Qin", + "Tie-Yan Liu" + ], + "emails": [ + "~Tao_Qin1", + "~Xu_Tan1", + "microsoft.com", + "~Tie-Yan_Liu1", + "~Jiawei_Chen4" + ], + "rank": 2423 + }, + { + "url": "https://openreview.net/forum?id=Zu3iPlzCe9J", + "decision": "Reject", + "ratings": [ + 4, + 4, + 6, + 3 + ], + "rating": "4.31", + "confidences": [ + 4, + 4, + 3, + 2 + ], + "abstract": "We formally define a feature-space attack where the adversary can perturb datapoints by arbitrary amounts but in restricted directions. By restricting the attack to a small random subspace, our model provides a clean abstraction for non-Lipschitz networks which map small input movements to large feature movements. We prove that classifiers with the ability to abstain are provably more powerful than those that cannot in this setting. Specifically, we show that no matter how well-behaved the natural data is, any classifier that cannot abstain will be defeated by such an adversary. However, by allowing abstention, we give a parameterized algorithm with provably good performance against such an adversary when classes are reasonably well-separated in feature space and the dimension of the feature space is high. We further use a data-driven method to set our algorithm parameters to optimize over the accuracy vs. abstention trade-off with strong theoretical guarantees. Our theory has direct applications to the technique of contrastive learning, where we empirically demonstrate the ability of our algorithms to obtain high robust accuracy with only small amounts of abstention in both supervised and self-supervised settings. 
Our results provide a first formal abstention-based gap, and a first provable optimization for the induced trade-off in an adversarial defense setting.", + "title": "On the Power of Abstention and Data-Driven Decision Making for Adversarial Robustness", + "authors": [ + "Nina Balcan", + "Avrim Blum", + "Dravyansh Sharma", + "Hongyang Zhang" + ], + "emails": [ + "~Avrim_Blum1", + "~Dravyansh_Sharma1", + "~Nina_Balcan1", + "~Hongyang_Zhang1" + ], + "rank": 2424 + }, + { + "url": "https://openreview.net/forum?id=LcPefbNSwx_", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 5 + ], + "rating": "4.31", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Deep neural network (DNN) models often involve features of high dimensions. In most cases, the high-dimensional features can be decomposed into two parts. The first part is a low-dimensional factor. The second part is the residual feature, with much-reduced variability and inter-feature correlation. This leads to a number of interesting theoretical findings for deep neural network training. Accordingly, we are inspired to develop a new factor normalization method for better performance. The proposed method leads to a new deep learning model with two important features. First, it allows factor related feature extraction. Second, it allows adaptive learning rates for factors and residuals, respectively. This leads to fast convergence speed on both training and validation datsets. A number of empirical experiments are presented to demonstrate its superior performance. 
The code is available at https://github.com/HazardNeo4869/FactorNormalization", + "title": "Factor Normalization for Deep Neural Network Models", + "authors": [ + "Haobo Qi", + "Jing Zhou", + "Hansheng Wang" + ], + "emails": [ + "~Jing_Zhou3", + "pku.edu.cn" + ], + "rank": 2425 + }, + { + "url": "https://openreview.net/forum?id=_0shJdtXI1f", + "ratings": [ + 4, + 3, + 6 + ], + "rating": "4.31", + "confidences": [ + 5, + 4, + 4 + ], + "abstract": "An emerging line of research has found that \\emph{hyperspherical} spaces better match the underlying geometry of facial images, as evidenced by the state-of-the-art facial recognition methods which benefit empirically from hyperspherical representations. Yet, these approaches rely on deterministic embeddings and hence suffer from the \\emph{feature ambiguity dilemma}, whereby ambiguous or noisy images are mapped into poorly learned regions of representation space, leading to inaccuracies~\\citep{shi2019probabilistic}. PFE is the first attempt to circumvent this dilemma. However, we theoretically and empirically identify two main failure cases of PFE when it is applied to hyperspherical deterministic embeddings aforementioned. To address these issues, in this paper, we propose a novel framework for face uncertainty learning in hyperspherical space. Mathematically, we extend the \\emph{von Mises Fisher} density to its r-radius counterpart and derive an optimization objective in closed form. For feature comparison, we also derive a closed-form mutual likelihood score for latents lying on hypersphere. 
Extensive experimental results on multiple challenging benchmarks confirm our hypothesis and theory, and showcase the superior performance of our framework against prior probabilistic methods and conventional hyperspherical deterministic embeddings both in risk-controlled recognition tasks and in face verification and identification tasks.", + "title": "Hypersphere Face Uncertainty Learning", + "authors": [ + "Shen Li", + "Jianqing Xu", + "Xiaqing Xu", + "Pengcheng Shen", + "Shaoxin Li", + "Bryan Hooi" + ], + "emails": [ + "~Shen_Li2", + "~Pengcheng_Shen1", + "~Jianqing_Xu1", + "~Xiaqing_Xu1", + "~Bryan_Hooi1", + "~Shaoxin_Li2" + ], + "rank": 2426 + }, + { + "url": "https://openreview.net/forum?id=o2N6AYOp31", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4 + ], + "rating": "4.31", + "confidences": [ + 4, + 5, + 4 + ], + "abstract": "We aim to build image generation models that generalize to new domains from few examples. To this end, we first investigate the generalization properties of classic image generators, and discover that autoencoders generalize extremely well to new domains, even when trained on highly constrained data. We leverage this insight to produce a robust, unsupervised few-shot image generation algorithm, and introduce a novel training procedure based on recovering an image from data augmentations. Our Augmentation-Interpolative AutoEncoders synthesize realistic images of novel objects from only a few reference images, and outperform both prior interpolative models and supervised few-shot image generators. 
Our procedure is simple and lightweight, generalizes broadly, and requires no category labels or other supervision during training.", + "title": "Augmentation-Interpolative AutoEncoders for Unsupervised Few-Shot Image Generation", + "authors": [ + "Davis Wertheimer", + "Omid Poursaeed", + "Bharath Hariharan" + ], + "emails": [ + "~Davis_Wertheimer1", + "~Bharath_Hariharan3", + "~Omid_Poursaeed2" + ], + "rank": 2427 + }, + { + "url": "https://openreview.net/forum?id=FcfH5Pskt2G", + "decision": "Reject", + "ratings": [ + 4, + 3, + 5, + 5 + ], + "rating": "4.31", + "confidences": [ + 5, + 2, + 4, + 2 + ], + "abstract": "The performance of \u03b2-Variational-Autoencoders (\u03b2-VAEs) and their variants on learning semantically meaningful, disentangled representations is unparalleled. On the other hand, there are theoretical arguments suggesting impossibility of unsupervised disentanglement. In this work, we show that small perturbations of existing datasets hide the convenient correlation structure that is easily exploited by VAE-based architectures. To demonstrate this, we construct modified versions of the standard datasets on which (i) the generative factors are perfectly preserved; (ii) each image undergoes a transformation barely visible to the human eye; (iii) the leading disentanglement architectures fail to produce disentangled representations. 
We intend for these datasets to play a role in separating correlation-based models from those that discover the true causal structure.\n\nThe construction of the modifications is non-trivial and relies on recent progress on mechanistic understanding of \u03b2-VAEs and their connection to PCA, while also providing additional insights that might be of stand-alone interest.", + "title": "Clearing the Path for Truly Semantic Representation Learning", + "authors": [ + "Dominik Zietlow", + "Michal Rolinek", + "Georg Martius" + ], + "emails": [ + "~Georg_Martius1", + "~Dominik_Zietlow1", + "~Michal_Rolinek2" + ], + "rank": 2428 + }, + { + "url": "https://openreview.net/forum?id=lvXLfNeCQdK", + "decision": "Reject", + "ratings": [ + 7, + 3, + 4 + ], + "rating": "4.31", + "confidences": [ + 3, + 5, + 5 + ], + "abstract": "In this paper, we study the problem of training certifiably robust models. Certifiable training minimizes an upper bound on the worst-case loss over the allowed perturbation, and thus the tightness of the upper bound is an important factor in building certifiably robust models. However, many studies have shown that Interval Bound Propagation (IBP) training uses much looser bounds but outperforms other models that use tighter bounds. We identify another key factor that influences the performance of certifiable training: \\textit{smoothness of the loss landscape}. We consider linear relaxation based methods and find significant differences in the loss landscape across these methods. Based on this analysis, we propose a certifiable training method that utilizes a tighter upper bound and has a landscape with favorable properties. 
The proposed method achieves performance comparable to state-of-the-art methods under a wide range of perturbations.", + "title": "Loss Landscape Matters: Training Certifiably Robust Models with Favorable Loss Landscape", + "authors": [ + "Sungyoon Lee", + "Woojin Lee", + "Jinseong Park", + "Jaewook Lee" + ], + "emails": [ + "~Woojin_Lee1", + "~Jaewook_Lee1", + "~Sungyoon_Lee1", + "~Jinseong_Park1" + ], + "rank": 2429 + }, + { + "url": "https://openreview.net/forum?id=TV9INIrmtWN", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 6 + ], + "rating": "4.31", + "confidences": [ + 4, + 3, + 4, + 2 + ], + "abstract": "Biological agents have adopted the principle of attention to limit the rate of incoming information from the environment. One question that arises is if an artificial agent has access to only a limited view of its surroundings, how can it control its attention to effectively solve tasks? We propose an approach for learning how to control a hard attention window by maximizing the mutual information between the environment state and the attention location at each step. The agent employs an internal world model to make predictions about its state and focuses attention towards where the predictions may be wrong. Attention is trained jointly with a dynamic memory architecture that stores partial observations and keeps track of the unobserved state. We demonstrate that our approach is effective in predicting the full state from a sequence of partial observations. We also show that the agent's internal representation of the surroundings, a live mental map, can be used for control in two partially observable reinforcement learning tasks. 
Videos of the trained agent can be found at ~\\url{https://sites.google.com/view/hard-attention-control}.", + "title": "Hard Attention Control By Mutual Information Maximization", + "authors": [ + "Himanshu Sahni", + "Charles Lee Isbell" + ], + "emails": [ + "~Himanshu_Sahni1", + "~Charles_Lee_Isbell1" + ], + "rank": 2430 + }, + { + "url": "https://openreview.net/forum?id=HkUfnZFt1Rw", + "decision": "Reject", + "ratings": [ + 4, + 3, + 5, + 6 + ], + "rating": "4.31", + "confidences": [ + 5, + 5, + 2, + 4 + ], + "abstract": "Graph measures can be used for graph node clustering using metric clustering algorithms. There are multiple measures applicable to this task, and which one performs better is an open question. We study the performance of 25 graph measures on generated graphs with different parameters. While usually measure comparisons are limited to general measure ranking on a particular dataset, we aim to explore the performance of various measures depending on graph features. Using an LFR generator, we create a dataset of ~7500 graphs covering the whole LFR parameter space. For each graph, we assess the quality of clustering with k-means algorithm for every considered measure. We determine the best measure for every area of the parameter space. We find that the parameter space consists of distinct zones where one particular measure is the best. We analyze the geometry of the resulting zones and describe it with simple criteria. 
Given particular graph parameters, this allows us to choose the best measure to use for clustering.", + "title": "Dissecting graph measures performance for node clustering in LFR parameter space", + "authors": [ + "Vladimir Ivashkin", + "Pavel Chebotarev" + ], + "emails": [ + "~Vladimir_Ivashkin1", + "gmail.com" + ], + "rank": 2431 + }, + { + "url": "https://openreview.net/forum?id=Whq-nTgCbNR", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4 + ], + "rating": "4.31", + "confidences": [ + 5, + 4, + 4 + ], + "abstract": "The paper addresses a problem of abnormalities detection in nonlinear processes represented by measured time series. Anomaly detection problem is usually formulated as finding outlier data points relative to some usual signals such as unexpected spikes, drops, or trend changes. In nonlinear dynamical systems, there are cases where a time series does not contain statistical outliers while the process corresponds to an abnormal configuration of the dynamical system. Since the polynomial neural architecture has a strong connection with the theory of differential equations, we use it for the feature extraction that describes the dynamical system itself. The paper discusses in both simulations and a practical example with real measurements the applicability of the proposed approach and it's benchmarking with existing methods.", + "title": "Anomaly detection in dynamical systems from measured time series", + "authors": [ + "Andrei Ivanov", + "Anna Golovkina" + ], + "emails": [ + "~Andrei_Ivanov1", + "spbu.ru" + ], + "rank": 2432 + }, + { + "url": "https://openreview.net/forum?id=AwPGPgExiYA", + "decision": "Reject", + "ratings": [ + 3, + 6, + 4, + 5 + ], + "rating": "4.31", + "confidences": [ + 4, + 2, + 3, + 4 + ], + "abstract": "Logical rules inside a knowledge graph (KG) are essential for reasoning, logical inference, and rule mining. 
However, existing works can only handle simple, i.e., chain-like and tree-like, rules and cannot capture KG's complex semantics, which can be better captured by graph-like rules. Besides, learning graph-like rules is very difficult because the graph structure exhibits a huge discrete search space. To address these issues, observing that the plausibility of logical rules can be explained by how frequently it appears in a KG, we propose a score function that represents graph-like rules with learnable parameters. The score also helps relax the discrete space into a continuous one and can be uniformly transformed into matrix form by the Einstein summation convention. Thus, it allows us to learn graph-like rules in an efficient, differentiable, and end-to-end training manner by optimizing the normalized score. We conduct extensive experiments on real-world datasets to show that our method outperforms previous works due to logical rules' better expressive ability. Furthermore, we demonstrate that our method can learn high-quality and interpretable graph-like logical rules.", + "title": "Differentiable Learning of Graph-like Logical Rules from Knowledge Graphs", + "authors": [ + "Hongzhi Shi", + "quanming yao", + "Yong Li" + ], + "emails": [ + "~quanming_yao1", + "~Yong_Li3", + "~Hongzhi_Shi1" + ], + "rank": 2433 + }, + { + "url": "https://openreview.net/forum?id=lXoWPoi_40", + "decision": "Reject", + "ratings": [ + 6, + 5, + 4, + 3 + ], + "rating": "4.31", + "confidences": [ + 3, + 3, + 2, + 5 + ], + "abstract": "Several papers argue that wide minima generalize better than narrow minima. In this paper, through detailed experiments that not only corroborate the generalization properties of wide minima, we also provide empirical evidence for a new hypothesis that the density of wide minima is likely lower than the density of narrow minima. Further, motivated by this hypothesis, we design a novel explore-exploit learning rate schedule. 
On a variety of image and natural language datasets, compared to their original hand-tuned learning rate baselines, we show that our explore-exploit schedule can result in either up to 0.84% higher absolute accuracy using the original training budget or up to 57% reduced training time while achieving the original reported accuracy. For example, we achieve state-of-the-art (SOTA) accuracy for IWSLT'14 (DE-EN) and WMT'14 (DE-EN) datasets by just modifying the learning rate schedule of a high performing model.", + "title": "Wide-minima Density Hypothesis and the Explore-Exploit Learning Rate Schedule", + "authors": [ + "Nikhil Iyer", + "V Thejas", + "Nipun Kwatra", + "Ramachandran Ramjee", + "Muthian Sivathanu" + ], + "emails": [ + "~Ramachandran_Ramjee1", + "~Muthian_Sivathanu1", + "~Nikhil_Iyer1", + "gmail.com", + "~Nipun_Kwatra1" + ], + "rank": 2434 + }, + { + "url": "https://openreview.net/forum?id=NibHms070zC", + "ratings": [ + 5, + 4, + 4 + ], + "rating": "4.31", + "confidences": [ + 4, + 4, + 5 + ], + "abstract": "In Federated Learning (FL), client devices collaboratively train a model without sharing the private data present on the devices. Federated Stochastic Gradient Descent (FedSGD) is a recent generalisation of the popular Federated Averaging algorithm. Recent works show that when client data is distributed heterogeneously, the loss function minimised by FedSGD differs from the 'true' loss that would be minimised by centralised training. Previous works propose decaying the client learning rate, \u03b3, to allow FedSGD to minimise the true loss. We propose instead decaying the number of local SGD steps, K, that clients perform during training rounds to allow minimisation of the true loss. Decaying K has the added benefit of reducing the total computation that clients perform during FedSGD. 
Real-world applications of FL use large numbers of low-powered smartphone or Internet-of-Things clients, so reduction of computation would provide significant savings in terms of energy and time. In this work, we prove for quadratic objectives that annealing K allows FedSGD to approach the true minimiser. We then perform thorough experimentation on three benchmark FL datasets to show that decaying K can achieve the same generalisation performance as decaying \u03b3, but with up to 3.8\u00d7 less total steps of SGD performed by clients.\n", + "title": "Faster Federated Learning with Decaying Number of Local SGD Steps", + "authors": [ + "Jed Mills", + "Jia Hu", + "Geyong Min" + ], + "emails": [ + "exeter.ac.uk", + "~Jed_Mills1" + ], + "rank": 2435 + }, + { + "url": "https://openreview.net/forum?id=lkDjGgdZAMA", + "ratings": [ + 4, + 5, + 5, + 3 + ], + "rating": "4.31", + "confidences": [ + 5, + 4, + 4, + 3 + ], + "abstract": "Understanding what information neural networks capture is an essential problem in deep learning, and studying whether different models capture similar features is an initial step to achieve this goal. Previous works sought to define metrics over the feature matrices to measure the difference between two models. However, different metrics sometimes lead to contradictory conclusions, and there has been no consensus on which metric is suitable to use in practice. In this work, we propose a novel metric that goes beyond previous approaches. Recall that one of the most practical scenarios of using the learned representations is to apply them to downstream tasks. We argue that we should design the metric based on a similar principle. For that, we introduce the transferred discrepancy (TD), a new metric that defines the difference between two representations based on their downstream-task performance. Through an asymptotic analysis, we show how TD correlates with downstream tasks and the necessity to define metrics in such a task-dependent fashion. 
In particular, we also show that under specific conditions, the TD metric is closely related to previous metrics. Our experiments show that TD can provide fine-grained information for varied downstream tasks, and for the models trained from different initializations, the learned features are not the same in terms of downstream-task predictions. We find that TD may also be used to evaluate the effectiveness of different training strategies. For example, we demonstrate that the models trained with proper data augmentations that improve the generalization capture more similar features in terms of TD, while those with data augmentations that hurt the generalization will not. This suggests a training strategy that leads to more robust representation also trains models that generalize better. ", + "title": "Transferred Discrepancy: Quantifying the Difference Between Representations", + "authors": [ + "Yunzhen Feng", + "Runtian Zhai", + "Di He", + "Liwei Wang", + "Bin Dong" + ], + "emails": [ + "~Di_He1", + "~Bin_Dong1", + "~Runtian_Zhai1", + "~Yunzhen_Feng1", + "~Liwei_Wang1" + ], + "rank": 2436 + }, + { + "url": "https://openreview.net/forum?id=D4A-v0kltaX", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5, + 4 + ], + "rating": "4.31", + "confidences": [ + 2, + 3, + 4, + 4 + ], + "abstract": "We present a lightweighted neural PDE representation to discover the hidden structure and predict the solution of different nonlinear PDEs. Our key idea is to leverage the prior of ``\"translational similarity\" of numerical PDE differential operators to drastically reduce the scale of learning model and training data. We implemented three central network components, including a neural functional convolution operator, a Picard forward iterative procedure, and an adjoint backward gradient calculator. 
Our novel paradigm fully leverages the multifaceted priors that stem from the sparse and smooth nature of the physical PDE solution manifold and the various mature numerical techniques such as adjoint solver, linearization, and iterative procedure to accelerate the computation. We demonstrate the efficacy of our method by robustly discovering the model and accurately predicting the solutions of various types of PDEs with small-scale networks and training sets. We highlight that all the PDE examples we showed were trained with up to 8 data samples and within 325 network parameters. ", + "title": "Neural Partial Differential Equations with Functional Convolution", + "authors": [ + "Ziqian Wu", + "Xingzhe He", + "Michael Zhang", + "Yijun Li", + "Cheng Yang", + "Rui Liu", + "Shiying Xiong", + "Bo Zhu" + ], + "emails": [ + "~Ziqian_Wu1", + "~Rui_Liu7", + "~Bo_Zhu2", + "~Cheng_Yang4", + "~Shiying_Xiong1", + "~Xingzhe_He1", + "~Yijun_Li4", + "~Michael_Zhang6" + ], + "rank": 2437 + }, + { + "url": "https://openreview.net/forum?id=b6BdrqTnFs7", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5, + 3 + ], + "rating": "4.31", + "confidences": [ + 3, + 3, + 4, + 3 + ], + "abstract": "In this paper, we present a compositional generalization approach in grounded agent instruction learning. Compositional generalization is an important part of human intelligence, but current neural network models do not have such ability. This is more complicated in multi-modal problems with grounding. Our proposed approach has two main ideas. First, we use interactions between agent and the environment to find components in the output. Second, we apply entropy regularization to learn corresponding input components for each output component. The results show the proposed approach significantly outperforms baselines in most tasks, with more than 25% absolute average accuracy increase. We also investigate the impact of entropy regularization and other changes with ablation study. 
We hope this work is the first step to address grounded compositional generalization, and it will be helpful in advancing artificial intelligence research.\n", + "title": "Grounded Compositional Generalization with Environment Interactions", + "authors": [ + "Yuanpeng Li" + ], + "emails": [ + "~Yuanpeng_Li2" + ], + "rank": 2438 + }, + { + "url": "https://openreview.net/forum?id=ysXk8cCHcQN", + "decision": "Reject", + "ratings": [ + 3, + 4, + 6 + ], + "rating": "4.30", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "Acoustic properties of objects corresponding to scattering characteristics are frequently used for 3D audio content creation, environmental acoustic effects, localization and acoustic scene analysis, etc. The numeric solvers used to compute these acoustic properties are too slow for interactive applications. We present a novel geometric deep learning algorithm based on discrete-laplacian and implicit encoders to compute these characteristics for rigid or deformable objects at interactive rates. We use a point cloud approximation of each object, and each point is encoded in a high-dimensional latent space. Our multi-layer network can accurately estimate these acoustic properties for arbitrary topologies and takes less than 1ms per object on a NVIDIA GeForce RTX 2080 Ti GPU. We also prove that our learning method is permutation and rotation invariant and demonstrate high accuracy on objects that are quite different from the training data. 
We highlight its application to generating environmental acoustic effects in dynamic environments.", + "title": "Fast 3D Acoustic Scattering via Discrete Laplacian Based Implicit Function Encoders", + "authors": [ + "Hsien-Yu Meng", + "Zhenyu Tang", + "Dinesh Manocha" + ], + "emails": [ + "~Hsien-Yu_Meng1", + "~Zhenyu_Tang4", + "~Dinesh_Manocha2" + ], + "rank": 2439 + }, + { + "url": "https://openreview.net/forum?id=8Xi5MLFE_IW", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4 + ], + "rating": "4.30", + "confidences": [ + 3, + 3, + 4 + ], + "abstract": "In model-based learning, an agent\u2019s model is commonly defined over transitions between consecutive states of an environment even though planning often requires reasoning over multi-step timescales, with intermediate states either unnecessary, or worse, accumulating prediction error. In contrast, intelligent behaviour in biological organisms is characterised by the ability to plan over varying temporal scales depending on the context. Inspired by the recent works on human time perception, we devise a novel approach to learning a transition dynamics model, based on the sequences of episodic memories that define the agent's subjective timescale \u2013 over which it learns world dynamics and over which future planning is performed. We implement this in the framework of active inference and demonstrate that the resulting subjective-timescale model (STM) can systematically vary the temporal extent of its predictions while preserving the same computational efficiency. Additionally, we show that STM predictions are more likely to introduce future salient events (for example new objects coming into view), incentivising exploration of new areas of the environment. As a result, STM produces more informative action-conditioned roll-outs that assist the agent in making better decisions. 
We validate significant improvement in our STM agent's performance in the Animal-AI environment against a baseline system, trained using the environment's objective-timescale dynamics.", + "title": "Episodic Memory for Learning Subjective-Timescale Models", + "authors": [ + "Alexey Zakharov", + "Matthew Crosby", + "Zafeirios Fountas" + ], + "emails": [ + "~Alexey_Zakharov1", + "imperial.ac.uk", + "~Zafeirios_Fountas1" + ], + "rank": 2440 + }, + { + "url": "https://openreview.net/forum?id=KTS3QeWxRQq", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5, + 4 + ], + "rating": "4.30", + "confidences": [ + 4, + 1, + 2, + 3 + ], + "abstract": "Variational autoencoder (VAE) estimates the posterior parameters (mean and variance) of latent variables corresponding to each input data. While it is used for many tasks, the transparency of the model is still an underlying issue. This paper provides a quantitative understanding of VAE property by interpreting VAE as a non-linearly scaled isometric embedding. According to the Rate-distortion theory, the optimal transform coding is achieved by using a PCA-like orthonormal transform where the transform space is isometric to the input. From this analogy, we show theoretically and experimentally that VAE can be mapped to an implicit isometric embedding with a scale factor derived from the posterior parameter. As a result, we can estimate the data probabilities in the input space from the prior, loss metrics, and corresponding posterior parameters. 
In addition, the quantitative importance of each latent variable can be evaluated like the eigenvalue of PCA.\n", + "title": "Quantitative Understanding of VAE as a Non-linearly Scaled Isometric Embedding", + "authors": [ + "Akira Nakagawa", + "Keizo Kato" + ], + "emails": [ + "~Akira_Nakagawa1", + "~Keizo_Kato1" + ], + "rank": 2441 + }, + { + "url": "https://openreview.net/forum?id=UkxdauhUYnu", + "ratings": [ + 4, + 5, + 4 + ], + "rating": "4.30", + "confidences": [ + 3, + 3, + 4 + ], + "abstract": "To classify images, neural networks extract features from raw inputs and then sum them up with fixed weights via the fully connected layer. However, the weights are fixed despite the input types. Such fixed prior limits networks' flexibility in adjusting feature reliance, which in turn enables attackers to flip networks' predictions by corrupting the most brittle features whose value would change drastically by minor perturbations. Inspired by the analysis, we replace the original fixed fully connected layer by dynamically calculating the posterior weight for each feature according to the input and connections between them. Also, a counterfactual baseline is integrated to precisely characterize the credit of each feature's contribution to the robustness and generality of the model. We empirically demonstrate that the proposed algorithm improves both standard and robust error against several strong attacks across various major benchmarks. 
Finally, we theoretically prove the minimal structure requirement for our framework to improve adversarial robustness in a fairly simple and natural setting.", + "title": "No Feature Is An Island: Adaptive Collaborations Between Features Improve Adversarial Robustness", + "authors": [ + "Yufeng Zhang", + "Yunan Zhang", + "ChengXiang Zhai" + ], + "emails": [ + "~Yunan_Zhang1", + "~ChengXiang_Zhai1", + "bytedance.com" + ], + "rank": 2442 + }, + { + "url": "https://openreview.net/forum?id=me5hEszKra4", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4 + ], + "rating": "4.30", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "The rapid advancements of computing technology facilitate the development of diverse deep learning applications. Unfortunately, the efficiency of parallel computing infrastructures varies widely with neural network models, which hinders the exploration of the design space to find high-performance neural network architectures on specific computing platforms for a given application. To address such a challenge, we propose a deep learning-based method, ResPerfNet, which trains a residual neural network with representative datasets obtained on the target platform to predict the performance for a deep neural network. Our experimental results show that ResPerfNet can accurately predict the execution time of individual neural network layers and full network models on a variety of platforms. 
In particular, ResPerfNet achieves 8.4% of mean absolute percentage error for LeNet, AlexNet and VGG16 on the NVIDIA GTX 1080Ti, which is substantially lower than the previously published works.", + "title": "ResPerfNet: Deep Residual Learning for Regressional Performance Modeling of Deep Neural Networks", + "authors": [ + "Chuan-Chi Wang", + "Ying-Chiao Liao", + "Chia-Heng Tu", + "Ming-Chang Kao", + "Wen-Yew Liang", + "Shih-Hao Hung" + ], + "emails": [ + "adlinktech.com", + "~Ying-Chiao_Liao1", + "~Shih-Hao_Hung1", + "ncku.edu.tw" + ], + "rank": 2443 + }, + { + "url": "https://openreview.net/forum?id=2fadDWoYCUy", + "ratings": [ + 4, + 5, + 4 + ], + "rating": "4.30", + "confidences": [ + 3, + 3, + 4 + ], + "abstract": "This work aims to enable efficient on-device contrastive learning from input streaming data after a model is deployed on edge devices such as robots or unmanned aerial vehicles (UAVs) so that they can adapt to a dynamic new environment for higher accuracy. On the other hand, such data usually does not have any labels, calling for unsupervised learning. Most recently, contrastive learning has demonstrated its great potential in learning visual representation from unlabeled data. However, directly applying it to streaming data requires storing a large dataset on-the-fly, which will quickly drain edge devices\u2019 storage resources. In this paper, we propose a framework to automatically select the most representative data from unlabeled input stream on-the-fly, which only requires the use of a small data buffer for dynamic learning. What is more, considering the fact that the data are not independent and identically distributed (iid) as in the traditional training process, we score new data as they come in by measuring the quality of their representations without requiring any label information, based on which the data in the buffer will be updated. 
Extensive experiments show that the learning speed and accuracy are greatly improved compared with approaches without data selection.", + "title": "Enabling Efficient On-Device Self-supervised Contrastive Learning by Data Selection", + "authors": [ + "Yawen Wu", + "Zhepeng Wang", + "Dewen Zeng", + "Yiyu Shi", + "Jingtong Hu" + ], + "emails": [ + "pitt.edu", + "~Dewen_Zeng1", + "~Yawen_Wu1", + "~Jingtong_Hu1", + "~Yiyu_Shi1" + ], + "rank": 2444 + }, + { + "url": "https://openreview.net/forum?id=4SiMia0kjba", + "decision": "Reject", + "ratings": [ + 6, + 6, + 5, + 2 + ], + "rating": "4.29", + "confidences": [ + 2, + 3, + 4, + 5 + ], + "abstract": "Achieving accurate spatio-temporal predictions in large-scale systems is extremely valuable in many real-world applications, such as weather forecasts, retail forecasting, and urban traffic forecasting. So far, most existing methods for multi-horizon, multi-task and multi-target predictions select important predicting variables via their correlations with responses, and thus it is highly possible that many forecasting models generated from those methods are not causal, leading to poor interpretability. The aim of this paper is to develop a collaborative causal spatio-temporal fusion transformer, named CausalTrans, to establish the collaborative causal effects of predictors on multiple forecasting targets, such as supply and demand in ride-sharing platforms. Specifically, we integrate the causal attention with the Conditional Average Treatment Effect (CATE) estimation method for causal inference. Moreover, we propose a novel and fast multi-head attention evolved from Taylor expansion instead of softmax, reducing time complexity from O(V2) to O(V), where V is the number of nodes in a graph. We further design a spatial graph fusion mechanism to significantly reduce the parameters' scale. 
We conduct a wide range of experiments to demonstrate the interpretability of causal attention, the effectiveness of various model components, and the time efficiency of our CausalTrans. As shown in these experiments, our CausalTrans framework can achieve up to 15% error reduction compared with various baseline methods. ", + "title": "Causal Probabilistic Spatio-temporal Fusion Transformers in Two-sided Ride-Hailing Markets", + "authors": [ + "Shixiang Wan", + "Shikai Luo", + "Hongtu Zhu" + ], + "emails": [ + "~Shixiang_Wan1", + "~Hongtu_Zhu2", + "~Shikai_Luo1" + ], + "rank": 2445 + }, + { + "url": "https://openreview.net/forum?id=5g5x0eVdRg", + "decision": "Reject", + "ratings": [ + 4, + 3, + 6, + 4 + ], + "rating": "4.29", + "confidences": [ + 4, + 4, + 4, + 2 + ], + "abstract": "Unsupervised learning of categorical representations using data augmentations appears to be a promising approach and has proven useful for finding suitable representations for downstream tasks. However current state-of-the-art methods require preprocessing (e.g. Sobel edge detection) to work. We introduce a mutual information minimization strategy for unsupervised learning from augmentations, that prevents learning from locking on to easy to find, yet unimportant, representations at the expense of more informative ones requiring more complex processing. We demonstrate specifically that this process learns representations which capture higher mutual information between augmentations, and demonstrate that these representations are better suited to the downstream exemplar task of clustering. 
We obtain substantial accuracy improvements on CIFAR-10, CIFAR-100-20, and SVHN.", + "title": "DHOG: Deep Hierarchical Object Grouping", + "authors": [ + "Luke Nicholas Darlow", + "Amos Storkey" + ], + "emails": [ + "~Amos_Storkey1", + "~Luke_Nicholas_Darlow1" + ], + "rank": 2446 + }, + { + "url": "https://openreview.net/forum?id=RCGBA1i5MF", + "decision": "Reject", + "ratings": [ + 6, + 5, + 2, + 5 + ], + "rating": "4.29", + "confidences": [ + 3, + 4, + 5, + 5 + ], + "abstract": "Long-tailed visual recognition poses significant challenges to traditional machine learning and emerging deep networks due to its inherent class imbalance. A common belief is that tail classes with few samples cannot exhibit enough regularity for pattern extraction. What makes things worse, the limited cardinality may lead to low exposure of tail classes in the training stage. Re-sampling methods, especially those who naively enlarge the exposure frequency, eventually fail with head classes under-represented and tail classes overfitted.\nArguing that long-tailed learning involves a trade-off between head class pattern extraction and tail class memorizing, we first empirically identify the regularity of classes under long-tailed distributions and find that regularity of the same training samples will be sharply decreased with the reduction of class cardinality. Motivated by the recent success of a series works on the memorization-generalization mechanism, we propose a simple yet effective training strategy by switching from instance-balanced sampling to class-reversed sampling to memorize tail classes without seriously damaging the representation of head classes. Closely after- wards, we give the theoretical generalization error upper bound to prove that class- reversed sampling is better than instance-balanced sampling during the last train- ing stage. 
In our experiments, the proposed method can reach the state-of-the-art performance more efficiently than current methods, on several datasets. Further experiments also validate the superior performance of the proposed sampling strategy, implying that the long-tailed learning trade-off could be effectively tackled only in the memorization stage with a small learning rate and over-exposure of tail samples.", + "title": "The Unreasonable Effectiveness of the Class-reversed Sampling in Tail Sample Memorization", + "authors": [ + "Benyi Hu", + "Chi Zhang", + "Yuehu Liu", + "Le Wang", + "Li Liu" + ], + "emails": [ + "~Li_Liu12", + "~Le_Wang1", + "~Yuehu_Liu1", + "~Benyi_Hu1", + "stu.xjtu.edu.cn" + ], + "rank": 2447 + }, + { + "url": "https://openreview.net/forum?id=X6_vet6HWX", + "ratings": [ + 4, + 5, + 5, + 3 + ], + "rating": "4.29", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Input dimensions are unnecessary for a given task when the target function can be expressed without such dimensions. Object's background in image recognition or redundant sentences in text classification are examples of unnecessary dimensions that are often present in datasets. Deep neural networks achieve remarkable generalization performance despite the presence of unnecessary dimensions but it is unclear whether these dimensions negatively affect neural networks or how. In this paper, we investigate the impact of unnecessary input dimensions on one of the central issues of machine learning: the number of training examples needed to achieve high generalization performance, which we refer to as the network's data efficiency. In a series of analyses with multi-layer perceptrons and deep convolutional neural networks, we show that the network's data efficiency depends on whether the unnecessary dimensions are \\emph{task-unrelated} or \\emph{task-related} (unnecessary due to redundancy). 
Namely, we demonstrate that increasing the number of \\emph{task-unrelated} dimensions leads to an incorrect inductive bias and as a result degrade the data efficiency, while increasing the number of \\emph{task-related} dimensions helps to alleviate the negative impact of the \\emph{task-unrelated} dimensions. These results highlight the need for mechanisms that remove \\emph{task-unrelated} dimensions, such as crops or foveation for image classification, to enable data efficiency gains.\n", + "title": "The Foes of Neural Network\u2019s Data Efficiency Among Unnecessary Input Dimensions", + "authors": [ + "Vanessa D'Amario", + "Sanjana Srivastava", + "Tomotake Sasaki", + "Xavier Boix" + ], + "emails": [ + "~Xavier_Boix1", + "~Tomotake_Sasaki1", + "~Sanjana_Srivastava2", + "~Vanessa_D'Amario1" + ], + "rank": 2448 + }, + { + "url": "https://openreview.net/forum?id=ES9cpVTyLL", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 5 + ], + "rating": "4.29", + "confidences": [ + 5, + 4, + 3, + 5 + ], + "abstract": "Coherent Gradients (CGH) [Chatterjee, ICLR 20] is a recently proposed hypothesis to explain why over-parameterized neural networks trained with gradient descent generalize well even though they have sufficient capacity to memorize the training set. The key insight of CGH is that, since the overall gradient for a single step of SGD is the sum of the per-example gradients, it is strongest in directions that reduce the loss on multiple examples if such directions exist. In this paper, we validate CGH on ResNet, Inception, and VGG models on ImageNet. Since the techniques presented in the original paper do not scale beyond toy models and datasets, we propose new methods. By posing the problem of suppressing weak gradient directions as a problem of robust mean estimation, we develop a coordinate-based median of means approach. 
We present two versions of this algorithm, M3, which partitions a mini-batch into 3 groups and computes the median, and a more efficient version RM3, which reuses gradients from previous two time steps to compute the median. Since they suppress weak gradient directions without requiring per-example gradients, they can be used to train models at scale. Experimentally, we find that they indeed greatly reduce overfitting (and memorization) and thus provide the first convincing evidence that CGH holds at scale. We also propose a new test of CGH that does not depend on adding noise to training labels or on suppressing weak gradient directions. Using the intuition behind CGH, we posit that the examples learned early in the training process (i.e., \"easy\" examples) are precisely those that have more in common with other training examples. Therefore, as per CGH, the easy examples should generalize better amongst themselves than the hard examples amongst themselves. We validate this hypothesis with detailed experiments, and believe that it provides further orthogonal evidence for CGH.", + "title": "Weak and Strong Gradient Directions: Explaining Memorization, Generalization, and Hardness of Examples at Scale", + "authors": [ + "Piotr Zielinski", + "Shankar Krishnan", + "Satrajit Chatterjee" + ], + "emails": [ + "~Shankar_Krishnan1", + "google.com", + "~Satrajit_Chatterjee1" + ], + "rank": 2449 + }, + { + "url": "https://openreview.net/forum?id=9hgEG-k57Zj", + "decision": "Reject", + "ratings": [ + 3, + 5, + 4, + 6 + ], + "rating": "4.29", + "confidences": [ + 4, + 4, + 4, + 2 + ], + "abstract": "Recent progress in offline reinforcement learning (RL) has made it possible to train strong RL agents from previously-collected, static datasets. However, depending on the quality of the trained agents and the application being considered, it is often desirable to improve such offline RL agents with further online interaction. 
As it turns out, fine-tuning offline RL agents is a non-trivial challenge, due to distribution shift \u2013 the agent encounters out-of-distribution samples during online interaction, which may cause bootstrapping error in Q-learning and instability during fine-tuning. In order to address the issue, we present a simple yet effective framework, which incorporates a balanced replay scheme and an ensemble distillation scheme. First, we propose to keep separate offline and online replay buffers, and carefully balance the number of samples from each buffer during updates. By utilizing samples from a wider distribution, i.e., both online and offline samples, we stabilize the Q-learning. Next, we present an ensemble distillation scheme, where we train an ensemble of independent actor-critic agents, then distill the policies into a single policy. In turn, we improve the policy using the Q-ensemble during fine-tuning, which allows the policy updates to be more robust to error in each individual Q-function. We demonstrate the superiority of our method on MuJoCo datasets from the recently proposed D4RL benchmark suite.\n", + "title": "Addressing Distribution Shift in Online Reinforcement Learning with Offline Datasets", + "authors": [ + "Seunghyun Lee", + "Younggyo Seo", + "Kimin Lee", + "Pieter Abbeel", + "Jinwoo Shin" + ], + "emails": [ + "~Kimin_Lee1", + "~Seunghyun_Lee2", + "~Jinwoo_Shin1", + "~Pieter_Abbeel2", + "~Younggyo_Seo1" + ], + "rank": 2450 + }, + { + "url": "https://openreview.net/forum?id=hPDC6tBFNiV", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4 + ], + "rating": "4.29", + "confidences": [ + 5, + 4, + 5 + ], + "abstract": "Quantifying uncertainty is critical to risk assessment and decision making in high stakes domains. However, prior works for deep neural network uncertainty estimation have mostly focused on point prediction. A systematic study of uncertainty quantification methods for spatiotemporal forecasting has been missing in the community. 
In this paper, we analyze forecasting uncertainty in spatiotemporal sequences from both the Bayesian and Frequentist point of view via statistical decision theory. We further conduct case studies to provide an empirical comparison of Bayesian and Frequentist uncertainty estimation techniques. Through experiments on traffic and COVID-19 forecasts, we conclude that, with a limited amount of computation, Bayesian methods are typically more robust in mean prediction, while Frequentist methods are more effective in estimating the confidence levels.", + "title": "Quantifying Uncertainty in Deep Spatiotemporal Forecasting", + "authors": [ + "Dongxia Wu", + "Liyao Gao", + "Xinyue Xiong", + "Matteo Chinazzi", + "Alessandro Vespignani", + "Yian Ma", + "Rose Yu" + ], + "emails": [ + "~Dongxia_Wu1", + "~Yian_Ma1", + "~Rose_Yu1", + "~Liyao_Gao2", + "northeastern.edu", + "~Matteo_Chinazzi1", + "~Alessandro_Vespignani1" + ], + "rank": 2451 + }, + { + "url": "https://openreview.net/forum?id=0hMthVxlS89", + "ratings": [ + 4, + 4, + 4, + 5 + ], + "rating": "4.29", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "We prove under commonly used assumptions the convergence of actor-critic reinforcement learning algorithms, which simultaneously learn a policy function, the actor, and a value function, the critic. Both functions can be deep neural networks of arbitrary complexity. Our framework allows showing convergence of the well known Proximal Policy Optimization (PPO) and of the recently introduced RUDDER. For the convergence proof we employ recently introduced techniques from the two time-scale stochastic approximation theory. Our results are valid for actor-critic methods that use episodic samples and that have a policy that becomes more greedy during learning. Previous convergence proofs assume linear function approximation, cannot treat episodic examples, or do not consider that policies become greedy. 
The latter is relevant since optimal policies are typically deterministic. ", + "title": "Convergence Proof for Actor-Critic Methods Applied to PPO and RUDDER", + "authors": [ + "Markus Holzleitner", + "Lukas Gruber", + "Jose Arjona-Medina", + "Johannes Brandstetter", + "Sepp Hochreiter" + ], + "emails": [ + "~Markus_Holzleitner1", + "~Johannes_Brandstetter1", + "~Lukas_Gruber2", + "~Jose_Arjona-Medina1", + "~Sepp_Hochreiter1" + ], + "rank": 2452 + }, + { + "url": "https://openreview.net/forum?id=xBoKLdKrZd", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5, + 3 + ], + "rating": "4.29", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": " Neural architecture search (NAS) has recently received extensive attention due to its effectiveness in automatically designing effective neural architectures. A major challenge in NAS is to conduct a fast and accurate evaluation of neural architectures. Commonly used fast architecture evaluators include parameter-sharing ones and predictor-based ones. Despite their high evaluation efficiency, the evaluation correlation (especially of the well-performing architectures) is still questionable. In this paper, we conduct an extensive assessment of both the parameter-sharing and predictor-based evaluators on the NAS-Bench-201 search space, and break up how and why different configurations and strategies influence the fitness of the evaluators. Specifically, we carefully develop a set of NAS-oriented criteria to understand the behavior of fast architecture evaluators in different training stages. 
And based on the findings of our experiments, we give pieces of knowledge and suggestions to guide NAS application and motivate further research.", + "title": "A Surgery of the Neural Architecture Evaluators", + "authors": [ + "Xuefei Ning", + "Wenshuo Li", + "Zixuan Zhou", + "Tianchen Zhao", + "Shuang Liang", + "Yin Zheng", + "Huazhong Yang", + "Yu Wang" + ], + "emails": [ + "~Tianchen_Zhao2", + "~Wenshuo_Li1", + "mails.tsinghua.edu.cn", + "~Yin_Zheng1", + "~Huazhong_Yang1", + "~Yu_Wang3", + "~Xuefei_Ning1", + "novauto.com.cn" + ], + "rank": 2453 + }, + { + "url": "https://openreview.net/forum?id=bIwkmDnSeu", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4 + ], + "rating": "4.29", + "confidences": [ + 4, + 5, + 5 + ], + "abstract": "Adversarial imitation learning has emerged as a general and scalable framework for automatic reward acquisition. However, we point out that previous methods commonly exploited occupancy-dependent reward learning formulation\u2014which hinders the reconstruction of optimal decision as an energy-based model. Despite the theoretical justification, the occupancy measures tend to cause issues in practice because of high variance and low vulnerability to domain shifts. Another reported problem is termination biases induced by provided rewarding and regularization schemes around terminal states. In order to deal with these issues, this work presents a novel algorithm called causal adversarial inverse reinforcement learning. Our formulation draws a strong connection between adversarial learning and energy-based reinforcement learning; thus, the architecture is capable of recovering a reward function that induces a multi-modal policy. 
In experiments, we demonstrate that our approach outperforms prior methods in challenging continuous control tasks, even under significant variation in the environments.", + "title": "Unbiased Learning with State-Conditioned Rewards in Adversarial Imitation Learning", + "authors": [ + "Dong-Sig Han", + "Hyunseo Kim", + "Hyundo Lee", + "Je-Hwan Ryu", + "Byoung-Tak Zhang" + ], + "emails": [ + "~Byoung-Tak_Zhang1", + "~Dong-Sig_Han2", + "bi.snu.ac.kr" + ], + "rank": 2454 + }, + { + "url": "https://openreview.net/forum?id=Wis-_MNpr4", + "decision": "Reject", + "ratings": [ + 4, + 3, + 5, + 5 + ], + "rating": "4.29", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "Protecting the privacy of input data is of growing importance as machine learning methods reach new application domains.\nIn this paper, we provide a unified training and inference framework for large DNNs while protecting input privacy and computation integrity. Our approach called DarKnight uses a novel data blinding strategy using matrix masking to create input obfuscation within a trusted execution environment (TEE). Our rigorous mathematical proof demonstrates that our blinding process provides an information-theoretic privacy guarantee by bounding information leakage. The obfuscated data can then be offloaded to any GPU for accelerating linear operations on blinded data. The results from linear operations on blinded data are decoded before performing non-linear operations within the TEE. This cooperative execution allows DarKnight to exploit the computational power of GPUs to perform linear operations while exploiting TEEs to protect input privacy. 
We implement DarKnight on an Intel SGX TEE augmented with a GPU to evaluate its performance.", + "title": "DarKnight: A Data Privacy Scheme for Training and Inference of Deep Neural Networks", + "authors": [ + "Hanieh Hashemi", + "Yongqin Wang", + "Murali Annavaram" + ], + "emails": [ + "~Murali_Annavaram1", + "~Hanieh_Hashemi1", + "usc.edu" + ], + "rank": 2455 + }, + { + "url": "https://openreview.net/forum?id=BvrKnFq_454", + "decision": "Reject", + "ratings": [ + 5, + 4, + 3, + 5 + ], + "rating": "4.27", + "confidences": [ + 3, + 3, + 4, + 5 + ], + "abstract": "Many popular adaptive gradient methods such as Adam and RMSProp rely on an exponential moving average (EMA) to normalize their stepsizes. While the EMA makes these methods highly responsive to new gradient information, recent research has shown that it also causes divergence on at least one convex optimization problem. We propose a novel method called Expectigrad, which adjusts stepsizes according to a per-component unweighted mean of all historical gradients and computes a bias-corrected momentum term jointly between the numerator and denominator. We prove that Expectigrad cannot diverge on every instance of the optimization problem known to cause Adam to diverge. We also establish a regret bound in the general stochastic nonconvex setting that suggests Expectigrad is less susceptible to gradient variance than existing methods are. 
Testing Expectigrad on several high-dimensional machine learning tasks, we find it often performs favorably to state-of-the-art methods with little hyperparameter tuning.", + "title": "Expectigrad: Fast Stochastic Optimization with Robust Convergence Properties", + "authors": [ + "Brett Daley", + "Christopher Amato" + ], + "emails": [ + "~Christopher_Amato1", + "~Brett_Daley1" + ], + "rank": 2456 + }, + { + "url": "https://openreview.net/forum?id=4LHz4IFGLQ-", + "ratings": [ + 3, + 4, + 5, + 5 + ], + "rating": "4.27", + "confidences": [ + 3, + 2, + 3, + 3 + ], + "abstract": "We propose an unsupervised neural model for learning a discrete embedding of words.\nUnlike existing discrete embeddings, our binary embedding supports vector arithmetic operations similar to continuous embeddings.\nOur embedding represents each word as a set of propositional statements describing a transition rule in classical/STRIPS planning formalism.\nThis makes the embedding directly compatible with symbolic, state of the art classical planning solvers.", + "title": "Discrete Word Embedding for Logical Natural Language Understanding", + "authors": [ + "Zilu Tang", + "Masataro Asai" + ], + "emails": [ + "~Zilu_Tang1", + "~Masataro_Asai1" + ], + "rank": 2457 + }, + { + "url": "https://openreview.net/forum?id=0jqRSnFnmL_", + "ratings": [ + 4, + 4, + 5, + 4 + ], + "rating": "4.27", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Directed acyclic graphs (DAGs) are widely used to model the casual relationships among random variables in many disciplines. One major class of algorithms for DAGs is called `search-and-score', which attempts to maximize some goodness-of-fit measure and returns a DAG with the best score. However, most existing methods highly rely on their model assumptions and cannot be applied to the more general real-world problems. 
This paper proposes a novel Reinforcement-Learning-based searching algorithm, Alpha-DAG, which gradually finds the optimal order to add edges by learning from the historical searching trajectories. At each decision window, the agent adds the edge with the largest scoring improvement to the current graph. The advantage of Alpha-DAG is supported by the numerical comparison against some state-of-the-art competitors in both synthetic and real examples.", + "title": "Alpha-DAG: a reinforcement learning based algorithm to learn Directed Acyclic Graphs", + "authors": [ + "Fan Zhou", + "Yifeng Pan", + "Shenghua Zhu", + "Xin HE" + ], + "emails": [ + "~Fan_Zhou7", + "~Yifeng_Pan3", + "~Shenghua_Zhu1", + "~Xin_HE6" + ], + "rank": 2458 + }, + { + "url": "https://openreview.net/forum?id=8_7yhptEWD", + "decision": "Reject", + "ratings": [ + 4, + 3, + 6, + 4 + ], + "rating": "4.27", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Existing analyses of the neural tangent kernel (NTK) for infinite-depth networks show that the kernel typically becomes degenerate as the number of layers grows. This raises the question of how to apply such methods to practical \"infinite depth\" architectures such as the recently-proposed deep equilibrium (DEQ) model, which directly computes the infinite-depth limit of a weight-tied network via root-finding. In this work, we show that because of the input injection component of these networks, DEQ models have non-degenerate NTKs even in the infinite depth limit. Furthermore, we show that these kernels themselves can be computed by an analogous root-finding problem as in traditional DEQs, and highlight methods for computing the NTK for both fully-connected and convolutional variants. 
We evaluate these models empirically, showing they match or improve upon the performance of existing regularized NTK methods.", + "title": "On the Neural Tangent Kernel of Equilibrium Models", + "authors": [ + "Zhili Feng", + "J Zico Kolter" + ], + "emails": [ + "~Zhili_Feng1", + "~J_Zico_Kolter1" + ], + "rank": 2459 + }, + { + "url": "https://openreview.net/forum?id=c5klJN-Bpq1", + "decision": "Reject", + "ratings": [ + 3, + 6, + 4, + 4 + ], + "rating": "4.27", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Can we generalize and improve the representation power of tree models? Tree models are often favored over deep neural networks due to their interpretable structures in problems where the interpretability is required, such as in the classification of feature-based data where each feature is meaningful. However, most tree models have low accuracies and easily overfit to training data. In this work, we propose Decision Transformer Network (DTN), our highly accurate and interpretable tree model based on our generalized framework of tree models, decision transformers. Decision transformers allow us to describe tree models in the context of deep learning. Our DTN is proposed based on improving the generalizable components of the decision transformer, which increases the representation power of tree models while preserving the inherent interpretability of the tree structure. 
Our extensive experiments on 121 feature-based datasets show that DTN outperforms the state-of-the-art tree models and even deep neural networks.", + "title": "Generalizing Tree Models for Improving Prediction Accuracy", + "authors": [ + "Jaemin Yoo", + "Lee Sael" + ], + "emails": [ + "~Jaemin_Yoo1", + "ajou.ac.kr" + ], + "rank": 2460 + }, + { + "url": "https://openreview.net/forum?id=qyVbUbYL2C", + "ratings": [ + 5, + 4, + 4 + ], + "rating": "4.27", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "Recently, training convolutional neural networks (CNNs) using blurry images has been identified as a potential means to produce more robust models for facial recognition (Vogelsang et al. 2018). This method of training is intended to mimic biological visual development, as human visual acuity develops from near-blindness to normal acuity in the first three to four months of life (Kugelberg 1992). Object recognition develops in tandem during this time, and this developmental period has been shown to be critical for many visual tasks in later childhood and adulthood. We explore the effects of training CNNs on images with different levels of applied blur, including training regimens with progressively less blurry training sets. Using subsets of ImageNet (Russakovsky 2015), CNN performance is evaluated for both broad object recognition and fine-grained classification tasks. Results for AlexNet (Krizhevsky et al. 2012) and the more compact SqueezeNet (Iandola et al. 2016) are compared. Using blurry images for training on their own or as part of a training sequence increases classification accuracy across collections of images with different resolutions. At the same time, blurry training data causes little change to training convergence time and false positive classification certainty. 
Our findings support the utility of learning from sequences of blurry images for more robust image recognition.", + "title": "Modeling Human Development: Effects of Blurred Vision on Category Learning in CNNs", + "authors": [ + "William Charles", + "Daniel Leeds" + ], + "emails": [ + "~William_Charles1", + "fordham.edu" + ], + "rank": 2461 + }, + { + "url": "https://openreview.net/forum?id=MbG7JBt0Yvo", + "decision": "Reject", + "ratings": [ + 6, + 4, + 3 + ], + "rating": "4.27", + "confidences": [ + 3, + 5, + 3 + ], + "abstract": "Sequence metric learning is becoming a widely adopted approach for various applications dealing with sequential multi-variate data such as activity recognition or natural language processing and is most of the time tackled with sequence alignment approaches or representation learning. \nIn this paper, we propose to study this subject from the point of view of dynamical system theory by drawing the analogy between synchronized trajectories produced by dynamical systems and the distance between similar sequences processed by a siamese recurrent neural network. \nIndeed, a siamese recurrent network comprises two identical sub-networks, two identical dynamical systems which can theoretically achieve complete synchronization if a coupling is introduced between them. \nWe therefore propose a new neural network model that implements this coupling with a new gate integrated into the classical Gated Recurrent Unit architecture. This model is thus able to simultaneously learn a similarity metric and the synchronization of unaligned multi-variate sequences in a weakly supervised way.\nOur experiments show that introducing such a coupling improves the performance of the siamese Gated Recurrent Unit architecture on an activity recognition dataset. 
", + "title": "Sequence Metric Learning as Synchronization of Recurrent Neural Networks", + "authors": [ + "Paul Compagnon", + "Gr\u00e9goire Lefebvre", + "Stefan Duffner", + "Christophe Garcia" + ], + "emails": [ + "~Paul_Compagnon1", + "orange.com", + "~Christophe_Garcia1", + "~Stefan_Duffner1" + ], + "rank": 2462 + }, + { + "url": "https://openreview.net/forum?id=HNytlGv1VjG", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5, + 4 + ], + "rating": "4.27", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "A wide breadth of research has devised data augmentation approaches that can improve both accuracy and generalization performance for neural networks. However, augmented data can end up being far from the clean data and what is the appropriate label is less clear. Despite this, most existing work simply reuses the original label from the clean data, and the choice of label accompanying the augmented data is relatively less explored. In this paper, we propose AutoLabel to automatically learn the labels for augmented data, based on the distance between the clean distribution and augmented distribution. AutoLabel is built on label smoothing and is guided by the calibration-performance over a hold-out validation set. We show that AutoLabel is a generic framework that can be easily applied to existing data augmentation methods, including AugMix, mixup, and adversarial training. Experiments on CIFAR-10, CIFAR-100 and ImageNet show that AutoLabel can improve models' accuracy and calibration performance, especially under distributional shift. Additionally, we demonstrate that AutoLabel can help adversarial training by bridging the gap between clean accuracy and adversarial robustness.", + "title": "What are effective labels for augmented data? 
Improving robustness with AutoLabel", + "authors": [ + "Yao Qin", + "Xuezhi Wang", + "Balaji Lakshminarayanan", + "Ed Chi", + "Alex Beutel" + ], + "emails": [ + "~Xuezhi_Wang3", + "~Balaji_Lakshminarayanan1", + "~Ed_Chi1", + "~Alex_Beutel1", + "~Yao_Qin1" + ], + "rank": 2463 + }, + { + "url": "https://openreview.net/forum?id=ls8D_-g8-ne", + "decision": "Reject", + "ratings": [ + 6, + 5, + 4, + 3 + ], + "rating": "4.27", + "confidences": [ + 2, + 2, + 4, + 3 + ], + "abstract": "Efficient design of biological sequences will have a great impact across many industrial and healthcare domains. However, discovering improved sequences requires solving a difficult optimization problem. Traditionally, this challenge was approached by biologists through a model-free method known as \u201cdirected evolution\u201d, the iterative process of random mutation and selection. As the ability to build models that capture the sequence-to-function map improves, such models can be used as oracles to screen sequences before running experiments. In recent years, interest in better algorithms that effectively use such oracles to outperform model-free approaches has intensified. These span from approaches based on Bayesian Optimization, to regularized generative models and adaptations of reinforcement learning. In this work, we implement an open-source Fitness Landscape EXploration Sandbox (FLEXS) environment to test and evaluate these algorithms based on their optimality, consistency, and robustness. Using FLEXS, we develop an easy-to-implement, scalable, and robust evolutionary greedy algorithm (AdaLead). Despite its simplicity, we show that AdaLead is a remarkably strong benchmark that out-competes more complex state of the art approaches in a variety of biologically motivated sequence design challenges. 
", + "title": "AdaLead: A simple and robust adaptive greedy search algorithm for sequence design", + "authors": [ + "Sam Sinai", + "Richard Wang", + "Alexander Whatley", + "Stewart Slocum", + "Elina Locane", + "Eric Kelsic" + ], + "emails": [ + "~Sam_Sinai1", + "jhu.edu", + "dynotx.com", + "robustintelligence.com", + "berkeley.edu" + ], + "rank": 2464 + }, + { + "url": "https://openreview.net/forum?id=fV2ScEA03Hg", + "decision": "Reject", + "ratings": [ + 5, + 6, + 4, + 3 + ], + "rating": "4.27", + "confidences": [ + 3, + 3, + 4, + 5 + ], + "abstract": "Mislabeled samples cause prediction errors. This study proposes a solution to the problem of incorrect labels, called AutoCleansing, to automatically capture the effect of incorrect labels and mitigate it without removing the mislabeled samples. AutoCleansing consists of a base network model and sample-category specific constants. Both parameters of the base model and sample-category constants are estimated simultaneously using the training data. Thereafter, predictions for test data are made using a base model without the constants capturing the mislabeled effects. A theoretical model for AutoCleansing is developed and showing that the gradient of the loss function of the proposed method can be zero at true parameters with mislabeled data if the model is correctly constructed. 
Experimental results show that AutoCleansing has better performance in test accuracy than previous studies for CIFAR-10, CIFAR-100, SVHN, and ImageNet datasets.", + "title": "AutoCleansing: Unbiased Estimation of Deep Learning with Mislabeled Data", + "authors": [ + "Koichi Kuriyama" + ], + "emails": [ + "~Koichi_Kuriyama1" + ], + "rank": 2465 + }, + { + "url": "https://openreview.net/forum?id=rVdLv-uzYup", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5, + 4 + ], + "rating": "4.27", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Existing model-based reinforcement learning methods often study perception modeling and decision making separately. We introduce joint Perception and Control as Inference (PCI), a general framework to combine perception and control for partially observable environments through Bayesian inference. Based on the fact that object-level inductive biases are critical in human perceptual learning and reasoning, we propose Object-based Perception Control (OPC), an instantiation of PCI which manages to facilitate control using automatic discovered object-based representations. We develop an unsupervised end-to-end solution and analyze the convergence of the perception model update. Experiments in a high-dimensional pixel environment demonstrate the learning effectiveness of our object-based perception control approach. 
Specifically, we show that OPC achieves good perceptual grouping quality and outperforms several strong baselines in accumulated rewards.", + "title": "Joint Perception and Control as Inference with an Object-based Implementation", + "authors": [ + "Minne Li", + "Zheng Tian", + "Pranav Nashikkar", + "Ian Davies", + "Ying Wen", + "Jun Wang" + ], + "emails": [ + "~Ying_Wen1", + "ucl.ac.uk", + "~Minne_Li1", + "~Jun_Wang2", + "~Zheng_Tian1" + ], + "rank": 2466 + }, + { + "url": "https://openreview.net/forum?id=37Fh1MiR5Ze", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4 + ], + "rating": "4.27", + "confidences": [ + 5, + 3, + 3 + ], + "abstract": "Despite the complicated structure of modern deep neural network architectures, they are still optimized with algorithms based on Stochastic Gradient Descent (SGD). However, the reason behind the effectiveness of SGD is not well understood, making its study an active research area. In this paper, we formulate deep neural network optimization as a dynamical system and show that the rigorous theory developed to study chaotic systems can be useful to understand SGD and its variants. In particular, we first observe that the inverse of the instability timescale of SGD optimization, represented by the largest Lyapunov exponent, corresponds to the most negative eigenvalue of the Hessian of the loss. This observation enables the introduction of an efficient method to estimate the largest eigenvalue of the Hessian. Then, we empirically show that for a large range of learning rates, SGD traverses the loss landscape across regions with largest eigenvalue of the Hessian similar to the inverse of the learning rate. This explains why effective learning rates can be found to be within a large range of values and shows that SGD implicitly uses the largest eigenvalue of the Hessian while traversing the loss landscape. This sheds some light on the effectiveness of SGD over more sophisticated second-order methods. 
We also propose a quasi-Newton method that dynamically estimates an optimal learning rate for the optimization of deep learning models. We demonstrate that our observations and methods are robust across different architectures and loss functions on CIFAR-10 dataset.", + "title": "A Chaos Theory Approach to Understand Neural Network Optimization", + "authors": [ + "Michele Sasdelli", + "Thalaiyasingam Ajanthan", + "Tat-Jun Chin", + "Gustavo Carneiro" + ], + "emails": [ + "~Gustavo_Carneiro1", + "~Tat-Jun_Chin2", + "~Michele_Sasdelli1", + "~Thalaiyasingam_Ajanthan1" + ], + "rank": 2467 + }, + { + "url": "https://openreview.net/forum?id=7xArdn_FKtV", + "ratings": [ + 5, + 5, + 3, + 4 + ], + "rating": "4.26", + "confidences": [ + 5, + 5, + 5, + 4 + ], + "abstract": "We propose an effective heterogeneous model transfer (HMT) method that can transfer the knowledge from one pretrained neural network to another neural network. Most of the existing deep learning methods depend much on a pretraining-finetuning strategy, i.e., pretraining a deep model on a large task-related (source) dataset and finetuning it on a small target dataset. Pretraining provides a universal feature representation for the target learning task and thus reduces the overfitting on a small target dataset. However, it is often assumed that the pretrained model and the target model share an identical backbone, which significantly limits the scalability of pretrained deep models. This paper relaxes this limitation and generalizes to heterogeneous model transfer between two different neural networks. Specifically, we select the longest chain from the source model and transfer it to the longest chain of the target model. Motivated by one-shot neural architecture search methods, the longest chain inherits merits from the source model and also serves as a weight-sharing path of the target model, thus provides a good initialization. 
With the longest chains, the layer-to-layer weight transfer is then transformed by bilinear interpolation and cyclic stack. HMT opens a new window for the pretraining-finetuning strategy and significantly improves the reuse efficiency of pretrained models without re-pretraining on the large source dataset. Experiments on several datasets show the effectiveness of HMT. Anonymous code is at: https://anonymous.4open.science/r/6ab184dc-3c64-4fdd-ba6d-1e5097623dfd/", + "title": "Heterogeneous Model Transfer between Different Neural Networks", + "authors": [ + "Guangcong Wang", + "Jianhuang Lai", + "Wenqi Liang", + "Guangrun Wang" + ], + "emails": [ + "~Guangrun_Wang1", + "mail2.sysu.edu.cn", + "~Guangcong_Wang1", + "~Jianhuang_Lai1" + ], + "rank": 2468 + }, + { + "url": "https://openreview.net/forum?id=uMNWbpIQP26", + "decision": "Reject", + "ratings": [ + 3, + 5, + 4, + 5 + ], + "rating": "4.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": " The following questions are fundamental to understanding the properties of over-parameterization in modern machine learning: (1) Under what conditions and at what rate does training converge to a global minimum? (2) What form of implicit regularization occurs through training? While significant progress has been made in answering both of these questions for gradient descent, they have yet to be answered more completely for general optimization methods. In this work, we establish sufficient conditions for linear convergence and obtain approximate implicit regularization results for generalized mirror descent (GMD), a generalization of mirror descent with a possibly time-dependent mirror. GMD subsumes popular first order optimization methods including gradient descent, mirror descent, and preconditioned gradient descent methods such as Adagrad. By using the Polyak-Lojasiewicz inequality, we first present a simple analysis under which non-stochastic GMD converges linearly to a global minimum. 
We then present a novel, Taylor-series based analysis to establish sufficient conditions for linear convergence of stochastic GMD. As a corollary, our result establishes sufficient conditions and provides learning rates for linear convergence of stochastic mirror descent and Adagrad. Lastly, we obtain approximate implicit regularization results for GMD by proving that GMD converges to an interpolating solution that is approximately the closest interpolating solution to the initialization in l2-norm in the dual space, thereby generalizing the result of Azizan, Lale, and Hassibi (2019) in the full batch setting. ", + "title": "Linear Convergence and Implicit Regularization of Generalized Mirror Descent with Time-Dependent Mirrors", + "authors": [ + "Adityanarayanan Radhakrishnan", + "Mikhail Belkin", + "Caroline Uhler" + ], + "emails": [ + "~Adityanarayanan_Radhakrishnan1", + "ucsd.edu", + "~Caroline_Uhler1" + ], + "rank": 2469 + }, + { + "url": "https://openreview.net/forum?id=r7qgus1bZ2", + "ratings": [ + 4, + 5, + 4 + ], + "rating": "4.25", + "confidences": [ + 4, + 3, + 5 + ], + "abstract": "One-shot object detection (OSOD) aims at detecting all instances that are consistent with the category of the single reference image. OSOD achieves object detection by comparing the query image and the reference image. We observe that the essential problem behind the limited performance of OSOD is that OSOD generates a lot of false positives due to its poor classification ability. This paper analyzes the serious false positive problem in OSOD and proposes a Focus on Classification One-Shot Object Detection (FOC OSOD) framework, which is improved in two important aspects: (1) classification cascade head with the fixed IoU threshold can enhance the robustness of classification by comparing multiple close regions; (2) classification region deformation on the query feature and the reference feature to obtain a more effective comparison region. 
Without bells and whistles, a single FOC obtains 1.8% AP and 1.3% AP improvement on the seen classes and the unseen classes over a Siamese Faster R-CNN baseline on the MS-COCO dataset in the one-shot setting. The code will be available.", + "title": "FOC OSOD: Focus on Classification One-Shot Object Detection", + "authors": [ + "Hanqing Yang", + "Huaijin Pi", + "SABA GHORBANI BARZEGAR", + "Yu Zhang" + ], + "emails": [ + "~Hanqing_Yang1", + "~SABA_GHORBANI_BARZEGAR1", + "~Yu_Zhang31", + "~Huaijin_Pi1" + ], + "rank": 2470 + }, + { + "url": "https://openreview.net/forum?id=7WwYBADS3E_", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4, + 4 + ], + "rating": "4.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "We present a data-driven model for fluid simulation under Lagrangian representation. Our model uses graphs to describe the fluid field, where physical quantities are encoded as node and edge features. Instead of directly predicting the acceleration or position correction given the current state, we decompose the simulation scheme into separate parts - advection, collision, and pressure projection. For these different reasoning tasks, we propose two kinds of graph neural network structures, node-focused networks, and edge-focused networks. By introducing physics prior knowledge, our model can be efficient in terms of training and inference. Our tests show that the learned model can produce accurate results and remain stable in scenarios with a large amount of particles and different geometries. Unlike many previous works, further tests demonstrate that our model is able to retain many important physical properties of incompressible fluids, such as minor divergence and reasonable pressure distribution. 
Additionally, our model can adopt a range of time step sizes different from ones using in the training set, which indicates its robust generalization capability.", + "title": "Learning Lagrangian Fluid Dynamics with Graph Neural Networks", + "authors": [ + "Zijie Li", + "Amir Barati Farimani" + ], + "emails": [ + "cmu.edu", + "~Zijie_Li2" + ], + "rank": 2471 + }, + { + "url": "https://openreview.net/forum?id=hnJSgY7p33a", + "ratings": [ + 5, + 4, + 4, + 4 + ], + "rating": "4.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Self-supervised learning is currently gaining a lot of attention, as it allows neural networks to learn robust representations from large quantities of unlabeled data. Additionally, multi-task learning can further improve representation learning by training networks simultaneously on related tasks, leading to significant performance improvements. In this paper, we propose a general framework to improve graph-based neural network models by combining self-supervised auxiliary learning tasks in a multi-task fashion. Since Graph Convolutional Networks are among the most promising approaches for capturing relationships among structured data points, we use them as a building block to achieve competitive results on standard semi-supervised graph classification tasks.", + "title": "Graph-Based Neural Network Models with Multiple Self-Supervised Auxiliary Tasks", + "authors": [ + "Franco Manessi", + "Alessandro Rozza" + ], + "emails": [ + "gmail.com", + "~Alessandro_Rozza2" + ], + "rank": 2472 + }, + { + "url": "https://openreview.net/forum?id=_lV1OrJIgiG", + "decision": "Reject", + "ratings": [ + 3, + 4, + 4, + 6 + ], + "rating": "4.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Efficiently training agents with planning capabilities has long been one of the major challenges in decision-making. 
In this work, we focus on zero-shot navigation ability on a given abstract 2-D occupancy map, like human navigation by reading a paper map, by treating it as an image. To learn this ability, we need to efficiently train an agent on environments with a small proportion of training maps and share knowledge effectively across the environments. We hypothesize that model-based navigation can better adapt an agent's behaviors to a task, since it disentangles the variations in map layout and goal location and enables longer-term planning ability on novel locations compared to reactive policies. We propose to learn a hypermodel that can understand patterns from a limited number of abstract maps and goal locations, to maximize alignment between the hypermodel predictions and real trajectories to extract information from multi-task off-policy experiences, and to construct denser feedback for planners by n-step goal relabelling. We train our approach on DeepMind Lab environments with layouts from different maps, and demonstrate superior performance on zero-shot transfer to novel maps and goals.", + "title": "Model-based Navigation in Environments with Novel Layouts Using Abstract 2-D Maps", + "authors": [ + "Linfeng Zhao", + "Lawson L. S. Wong" + ], + "emails": [ + "~Linfeng_Zhao1", + "~Lawson_L._S._Wong1" + ], + "rank": 2473 + }, + { + "url": "https://openreview.net/forum?id=KRKGJrbPcKE", + "ratings": [ + 5, + 4, + 4 + ], + "rating": "4.25", + "confidences": [ + 3, + 5, + 4 + ], + "abstract": "Multiple instance learning (MIL) is a machine learning paradigm which learns the mapping between bags of instances and bag labels. There are different MIL tasks which can be solved by different MIL methods. One common component of all MIL methods is the MIL pooling filter, which obtains bag level representations from extracted features of instances. 
Here, we recommend and discuss a grouping scheme for MIL pooling filters: point estimate based pooling filters and distribution based pooling filters. The point estimate based pooling filters include the standard pooling filters, such as \u2018max\u2019, \u2018mean\u2019 and \u2018attention\u2019 pooling. The distribution based pooling filters include recently proposed \u2018distribution\u2019 pooling and newly designed \u2018distribution with attention\u2019 pooling. In this paper, we perform the first systematic analysis of different pooling filters. We theoretically showed that the distribution based pooling filters are superior to the point estimate based counterparts in terms of amount of information captured while obtaining bag level representations from extracted features. Then, we empirically study the performance of the 5 pooling filters, namely \u2018max\u2019, \u2018mean\u2019, \u2018attention\u2019, \u2018distribution\u2019 and \u2018distribution with attention\u2019, on distinct real world MIL tasks. We showed that the performance of different pooling filters are different for different MIL tasks. 
Moreover, consistent with our theoretical analysis, models with distribution based pooling filters almost always performed equal or better than that with point estimate based pooling filters.", + "title": "Distribution Based MIL Pooling Filters are Superior to Point Estimate Based Counterparts", + "authors": [ + "Mustafa Umit Oner", + "Jared Marc Song", + "Hwee Kuan Lee", + "Wing-Kin Sung" + ], + "emails": [ + "~Hwee_Kuan_Lee1", + "~Mustafa_Umit_Oner1", + "~Wing-Kin_Sung1", + "~Jared_Marc_Song1" + ], + "rank": 2474 + }, + { + "url": "https://openreview.net/forum?id=8EGmvcCVrmZ", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 4 + ], + "rating": "4.25", + "confidences": [ + 3, + 5, + 3, + 1 + ], + "abstract": "In singular models, the optimal set of parameters forms an analytic set with singularities and classical statistical inference cannot be applied to such models. This is significant for deep learning as neural networks are singular and thus ``dividing\" by the determinant of the Hessian or employing the Laplace approximation are not appropriate. Despite its potential for addressing fundamental issues in deep learning, singular learning theory appears to have made little inroads into the developing canon of deep learning theory. Via a mix of theory and experiment, we present an invitation to singular learning theory as a vehicle for understanding deep learning and suggest important future work to make singular learning theory directly applicable to how deep learning is performed in practice. 
", + "title": "Deep Learning is Singular, and That's Good", + "authors": [ + "Daniel Murfet", + "Susan Wei", + "Mingming Gong", + "Hui Li", + "Jesse Gell-Redman", + "Thomas Quella" + ], + "emails": [ + "~Mingming_Gong1", + "unimelb.edu.au", + "~Susan_Wei1", + "~Daniel_Murfet1", + "student.unimelb.edu.au" + ], + "rank": 2475 + }, + { + "url": "https://openreview.net/forum?id=3cCWBFRuZBI", + "ratings": [ + 4, + 3, + 6 + ], + "rating": "4.25", + "confidences": [ + 3, + 5, + 4 + ], + "abstract": "Uncertainty estimation is an essential step in the evaluation of the robustness for deep learning models in computer vision, especially when applied in risk-sensitive areas. However, most state-of-the-art deep learning models either fail to obtain uncertainty estimation or need significant modification (e.g., formulating a proper Bayesian treatment) to obtain it. Most previous methods are not able to take an arbitrary model off the shelf and generate uncertainty estimation without retraining or redesigning it. To address this gap, we perform a systematic exploration into training-free uncertainty estimation for dense regression, an unrecognized yet important problem, and provide a theoretical construction justifying such estimations. We propose three simple and scalable methods to analyze the variance of outputs from a trained network under tolerable perturbations: infer-transformation, infer-noise, and infer-dropout. They operate solely during inference, without the need to re-train, re-design, or fine-tune the model, as typically required by state-of-the-art uncertainty estimation methods. Surprisingly, even without involving such perturbations in training, our methods produce comparable or even better uncertainty estimation when compared to training-required state-of-the-art methods. 
", + "title": "Training-Free Uncertainty Estimation for Dense Regression: Sensitivity as a Surrogate", + "authors": [ + "Lu Mi", + "Hao Wang", + "Yonglong Tian", + "Nir Shavit" + ], + "emails": [ + "~Nir_Shavit1", + "~Lu_Mi1", + "~Hao_Wang3", + "~Yonglong_Tian1" + ], + "rank": 2476 + }, + { + "url": "https://openreview.net/forum?id=YgrdmztE4OY", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 5 + ], + "rating": "4.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Federated learning (FL) has emerged as the predominant approach for collaborative training of neural network models across multiple users, without the need to gather the data at a central location. One of the important challenges in this setting is data heterogeneity; different users have different data characteristics. For this reason, training and using a single global model might be suboptimal when considering the performance of each of the individual user\u2019s data. In this work, we tackle this problem via Federated Mixture of Experts, FedMix, a framework that allows us to train an ensemble of specialized models. FedMix adaptively selects and trains a user-specific selection of the ensemble members. We show that users with similar data characteristics select the same members and therefore share statistical strength while mitigating the effect of non-i.i.d data. 
Empirically, we show through an extensive experimental evaluation that FedMix improves performance compared to using a single global model while requiring similar or less communication costs.", + "title": "Federated Mixture of Experts", + "authors": [ + "Matthias Reisser", + "Christos Louizos", + "Efstratios Gavves", + "Max Welling" + ], + "emails": [ + "~Matthias_Reisser1", + "~Max_Welling1", + "~Efstratios_Gavves1", + "~Christos_Louizos1" + ], + "rank": 2477 + }, + { + "url": "https://openreview.net/forum?id=D4QFCXGe_z2", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4 + ], + "rating": "4.25", + "confidences": [ + 3, + 5, + 4 + ], + "abstract": "Attention mechanisms are generic inductive biases that have played a critical role in improving the state-of-the-art in supervised learning, unsupervised pre-training and generative modeling for multiple domains including vision, language and speech. However, they remain relatively under-explored for neural network architectures typically used in reinforcement learning (RL) from high dimensional inputs such as pixels. In this paper, we propose and study the effectiveness of augmenting a simple attention module in the convolutional encoder of an RL agent. Through experiments on the widely benchmarked DeepMind Control Suite environments, we demonstrate that our proposed module can (i) extract interpretable task-relevant information such as agent locations and movements without the need for data augmentations or contrastive losses; (ii) significantly improve the sample-efficiency and final performance of the agents. 
We hope our simple and effective approach will serve as a strong baseline for future research incorporating attention mechanisms in reinforcement learning and control.", + "title": "R-LAtte: Attention Module for Visual Control via Reinforcement Learning", + "authors": [ + "Mandi Zhao", + "Qiyang Li", + "Aravind Srinivas", + "Ignasi Clavera", + "Kimin Lee", + "Pieter Abbeel" + ], + "emails": [ + "~Kimin_Lee1", + "~Qiyang_Li1", + "~Aravind_Srinivas1", + "~Mandi_Zhao1", + "~Ignasi_Clavera1", + "~Pieter_Abbeel2" + ], + "rank": 2478 + }, + { + "url": "https://openreview.net/forum?id=_OGAW_hznmG", + "decision": "Reject", + "ratings": [ + 4, + 4, + 3, + 6 + ], + "rating": "4.25", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "Maximum entropy (MAXENT) method has a large number of\napplications in theoretical and applied machine learning, since it\nprovides a convenient non-parametric tool for estimating unknown\nprobabilities. The method is a major contribution of statistical\nphysics to probabilistic inference. However, a systematic approach\ntowards its validity limits is currently missing. Here we study MAXENT\nin a Bayesian decision theory set-up, i.e. assuming that there exists a\nwell-defined prior Dirichlet density for unknown probabilities, and that\nthe average Kullback-Leibler (KL) distance can be employed for deciding\non the quality and applicability of various estimators. These allow to\nevaluate the relevance of various MAXENT constraints, check its general\napplicability, and compare MAXENT with estimators having various degrees\nof dependence on the prior, {\\it viz.} the regularized maximum\nlikelihood (ML) and the Bayesian estimators. We show that MAXENT applies\nin sparse data regimes, but needs specific types of prior information. \nIn particular, MAXENT can outperform the optimally regularized\nML provided that there are prior rank correlations between the estimated\nrandom quantity and its probabilities. 
", + "title": "Maximum Entropy competes with Maximum Likelihood", + "authors": [ + "Armen Allahverdyan" + ], + "emails": [ + "~Armen_Allahverdyan1" + ], + "rank": 2479 + }, + { + "url": "https://openreview.net/forum?id=paUVOwaXTAR", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5, + 4 + ], + "rating": "4.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Conditional computation and modular networks have been recently proposed for multitask learning and other problems as a way to decompose problem solving into multiple reusable computational blocks. We propose a novel fully-differentiable approach for learning modular networks.In our method, the modules can be invoked repeatedly and allow knowledge transfer to novel tasks by adjusting the order of computation. This allows soft weight sharing between tasks with only a small increase in the number of parameters. We show that our method leads to interpretable self-organization of modules in case of multi-task learning, transfer learning and domain adaptation while achieving competitive results on those tasks. 
From practical perspective, our approach allows to: (a) reuse existing modules for learning new task by adjusting the computation order, (b) use it for unsupervised multi-source domain adaptation to illustrate that adaptation to unseen data can be achieved by only manipulating the order of pretrained modules, (c) show how our approach can be used to increase accuracy of existing architectures for image classification tasks such as ImageNet, without any parameter increase, by reusing the same block multiple times.", + "title": "Compositional Models: Multi-Task Learning and Knowledge Transfer with Modular Networks", + "authors": [ + "Andrey Zhmoginov", + "Dina Bashkirova", + "Mark Sandler" + ], + "emails": [ + "~Dina_Bashkirova1", + "~Mark_Sandler1", + "~Andrey_Zhmoginov1" + ], + "rank": 2480 + }, + { + "url": "https://openreview.net/forum?id=pwIDi7D8Lf", + "ratings": [ + 5, + 4, + 4, + 4 + ], + "rating": "4.25", + "confidences": [ + 3, + 3, + 3, + 3 + ], + "abstract": "Leveraging well-established MCMC strategies, we propose MCMC-interactive variational inference (MIVI) to not only estimate the posterior in a time constrained manner, but also facilitate the design of MCMC transitions. Constructing a variational distribution followed by a short Markov chain that has parameters to learn, MIVI takes advantage of the complementary properties of variational inference and MCMC to encourage mutual improvement. On one hand, with the variational distribution locating high posterior density regions, the Markov chain is optimized within the variational inference framework to efficiently target the posterior despite a small number of transitions. On the other hand, the optimized Markov chain with considerable flexibility guides the variational distribution towards the posterior and alleviates its underestimation of uncertainty. 
Furthermore, we prove the optimized Markov chain in MIVI admits extrapolation, which means its marginal distribution gets closer to the true posterior as the chain grows. Therefore, the Markov chain can be used separately as an efficient MCMC scheme. Experiments show that MIVI not only accurately and efficiently approximates the posteriors but also facilitates designs of stochastic gradient MCMC and transition kernels for Gibbs sampling.", + "title": "MCMC-Interactive Variational Inference", + "authors": [ + "Quan Zhang", + "Huangjie Zheng", + "Mingyuan Zhou" + ], + "emails": [ + "~Quan_Zhang1", + "~Mingyuan_Zhou1", + "~Huangjie_Zheng1" + ], + "rank": 2481 + }, + { + "url": "https://openreview.net/forum?id=vC8hNRk9dOR", + "decision": "Reject", + "ratings": [ + 3, + 4, + 4, + 6 + ], + "rating": "4.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Online Continual Learning (OCL) studies learning over a continuous data stream without observing any single example more than once, a setting that is closer to the experience of humans and systems that must learn \u201con-the-wild\u201d. Yet, commonly available benchmarks are far from these real world conditions, because they explicitly signal different tasks, lack latent similarity structure or assume temporal independence between different examples. Here, we propose a new benchmark for OCL based on language modelling in which input alternates between different languages and domains without any explicit delimitation. Additionally, we propose new metrics to study catastrophic forgetting in this setting and evaluate multiple baseline models based on compositions of experts. 
Finally, we introduce a simple gating technique that learns the latent similarities between different inputs, improving the performance of a Products of Experts model.", + "title": "Evaluating Online Continual Learning with CALM", + "authors": [ + "Germ\u00e1n Kruszewski", + "Ionut Teodor Sorodoc", + "Tomas Mikolov" + ], + "emails": [ + "~Germ\u00e1n_Kruszewski1", + "~Tomas_Mikolov1", + "~Ionut_Teodor_Sorodoc1" + ], + "rank": 2482 + }, + { + "url": "https://openreview.net/forum?id=bQNosljkHj", + "decision": "Reject", + "ratings": [ + 5, + 3, + 4, + 5 + ], + "rating": "4.25", + "confidences": [ + 5, + 4, + 4, + 3 + ], + "abstract": "We present geometric Bayesian active learning by disagreements (GBALD), a framework that performs BALD on its geometric interpretation interacting with a deep learning model. There are two main components in GBALD: initial acquisitions based on core-set construction and model uncertainty estimation with those initial acquisitions. Our key innovation is to construct the core-set on an ellipsoid, not typical sphere, preventing its updates towards the boundary regions of the distributions. Main improvements over BALD are twofold: relieving sensitivity to uninformative prior and reducing redundant information of model uncertainty. To guarantee the improvements, our generalization analysis proves that, compared to typical Bayesian spherical interpretation, geodesic search with ellipsoid can derive a tighter lower error bound and achieve higher probability to obtain a nearly zero error. 
Experiments on acquisitions with several scenarios demonstrate that, yielding slight perturbations to noisy and repeated samples, GBALD further achieves significant accuracy improvements than BALD, BatchBALD and other baselines.", + "title": "On the Geometry of Deep Bayesian Active Learning", + "authors": [ + "Xiaofeng Cao", + "Ivor Tsang" + ], + "emails": [ + "~Ivor_Tsang1", + "~Xiaofeng_Cao1" + ], + "rank": 2483 + }, + { + "url": "https://openreview.net/forum?id=k2Hm5Szfl5Z", + "decision": "Reject", + "ratings": [ + 5, + 5, + 3 + ], + "rating": "4.25", + "confidences": [ + 2, + 3, + 3 + ], + "abstract": "We consider the Principal Component Analysis (PCA) problem for tensors T\u2208(Rn)\u2297k of large dimension n and of arbitrary order k\u22653. It consists in recovering a spike v0\u2297k (related to a signal vector v0\u2208Rn) corrupted by a Gaussian noise tensor Z\u2208(Rn)\u2297k such that T=\u03b2v0\u2297k+Z where \u03b2 is the signal-to-noise ratio. In this paper, we propose a new framework based on tools developed by the theoretical physics community to address this important problem. They consist in trace invariants of tensors built by judicious contractions (extension of matrix product) of the indices of the tensor T. Inspired by these tools, we introduce a new process that builds for each invariant a matrix whose top eigenvector is correlated to the signal for \u03b2 sufficiently large. Then, we give examples of classes of invariants for which we demonstrate that this correlation happens above the best algorithmic threshold (\u03b2\u2265nk/4) known so far. 
This method has many algorithmic advantages: (i) it provides a detection algorithm linear in time and that has only O(1) memory requirements (ii) the algorithms are very suitable for parallel architectures and have a lot of potential of optimization given the simplicity of the mathematical tools involved (iii) experimental results show an improvement of the state of the art for the symmetric tensor PCA. Furthermore, this framework allows more general applications by being able to theoretically study the recovery of a spike in the form of v1\u2297\u22ef\u2297vk with different dimensions (T\u2208Rn1\u00d7n2\u00d7\u22ef\u00d7nk with n1,\u2026,nk\u2208N) as well as the recovery of a sum of different orthogonal spikes. We provide experimental results to these different cases that match well with our theoretical findings.", + "title": "A new framework for tensor PCA based on trace invariants", + "authors": [ + "Mohamed Ouerfelli", + "mohamed Tamaazousti", + "Vincent Rivasseau" + ], + "emails": [ + "~mohamed_Tamaazousti1", + "~Mohamed_Ouerfelli1", + "gmail.com" + ], + "rank": 2484 + }, + { + "url": "https://openreview.net/forum?id=ZJGnFbd6vW", + "decision": "Reject", + "ratings": [ + 5, + 7, + 2 + ], + "rating": "4.25", + "confidences": [ + 4, + 3, + 5 + ], + "abstract": "Existing deep learning methodologies within the medical domain are typically population-based and difficult to interpret. This limits their clinical utility as population-based findings may not generalize to the individual patient. To overcome these obstacles, we propose to learn patient-specific representations, entitled patient cardiac prototypes (PCPs), that efficiently summarize the cardiac state of a patient. 
We show that PCPs, learned in an end-to-end manner via contrastive learning, allow for the discovery of similar patients both within and across datasets, and can be exploited for dataset distillation as a compact substitute for the original dataset.", + "title": "PCPs: Patient Cardiac Prototypes", + "authors": [ + "Dani Kiyasseh", + "Tingting Zhu", + "David A. Clifton" + ], + "emails": [ + "~Dani_Kiyasseh1", + "~David_A._Clifton1", + "eng.ox.ac.uk" + ], + "rank": 2485 + }, + { + "url": "https://openreview.net/forum?id=YYvtmM9TPw", + "ratings": [ + 5, + 4, + 4, + 4 + ], + "rating": "4.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Self-supervised representation learning has seen remarkable progress in the last few years. More recently, contrastive instance learning has shown impressive results compared to its supervised learning counterparts, in particular on downstream tasks like image classification and object detection. However, even with the ever increased interest in contrastive instance learning, it is still largely unclear why these methods work so well. In this paper, we aim to unravel some of the mysteries behind their success, which are the good practices. In particular, we investigate why the nonlinear projection head is essential, why instance discrimination does not suffer from strong data augmentation, and if large amounts of negative samples are required during contrastive loss computation. 
Through an extensive empirical analysis, we hope to not only provide insights but also lay out a set of best practices that led to the success of recent work in self-supervised representation learning.", + "title": "Towards Good Practices in Self-Supervised Representation Learning", + "authors": [ + "srikar appalaraju", + "Yi Zhu", + "Yusheng Xie", + "Istvan Fehervari" + ], + "emails": [ + "~Istvan_Fehervari1", + "~Yi_Zhu1", + "~srikar_appalaraju1", + "~Yusheng_Xie1" + ], + "rank": 2486 + }, + { + "url": "https://openreview.net/forum?id=0Zxk3ynq7jE", + "decision": "Reject", + "ratings": [ + 4, + 3, + 3, + 7 + ], + "rating": "4.25", + "confidences": [ + 4, + 3, + 5, + 4 + ], + "abstract": "Machine-learned safety-critical systems need to be self-aware and reliably know their unknowns in the open-world. This is often explored through the lens of anomaly/outlier detection or out-of-distribution modeling. One popular formulation is that of open-set classification, where an image classifier trained for 1-of-K classes should also recognize images belonging to a (K+1)th \"other\" class, not present in the training set. Recent work has shown that, somewhat surprisingly, most if not all existing open-world methods do not work well on high-dimensional open-world images (Shafaei et al. 2019). In this paper, we carry out an empirical exploration of open-set classification, and find that combining classic statistical methods with carefully computed features can dramatically outperform prior work. We extract features from off-the-shelf (OTS) state-of-the-art networks for the underlying K-way closed-world task. 
We leverage insights from the retrieval community for computing feature descriptors that are low-dimensional (via pooling and PCA) and normalized (via L2-normalization), enabling the modeling of training data densities via classic statistical tools such as kmeans and Gaussian Mixture Models (GMMs).", + "title": "An Empirical Exploration of Open-Set Recognition via Lightweight Statistical Pipelines", + "authors": [ + "Shu Kong", + "Deva Ramanan" + ], + "emails": [ + "~Deva_Ramanan1", + "~Shu_Kong1" + ], + "rank": 2487 + }, + { + "url": "https://openreview.net/forum?id=SReYZtNJXzD", + "ratings": [ + 5, + 3, + 5, + 4 + ], + "rating": "4.25", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "Real-world large-scale datasets usually contain noisy labels and are imbalanced. Therefore, we propose derivative manipulation (DM), a novel and general example weighting approach for training robust deep models under these adverse conditions. \n\nDM has two main merits. First, loss function and example weighting are common techniques in the literature. DM reveals their connection (a loss function does example weighting) and is a replacement of both. Second, despite that a loss defines an example weighting scheme by its derivative, in the loss design, we need to consider whether it is differentiable. Instead, DM is more flexible by directly modifying the derivative so that a loss can be a non-elementary format too. Technically, DM defines an emphasis density function by a derivative magnitude function. DM is generic in that diverse weighting schemes can be derived. \n\nExtensive experiments on both vision and language tasks prove DM\u2019s effectiveness.", + "title": "Derivative Manipulation for General Example Weighting", + "authors": [ + "Xinshao Wang", + "Elyor Kodirov", + "Yang Hua", + "Neil M. 
Robertson" + ], + "emails": [ + "~Neil_M._Robertson1", + "~Yang_Hua2", + "~Elyor_Kodirov1", + "~Xinshao_Wang1" + ], + "rank": 2488 + }, + { + "url": "https://openreview.net/forum?id=ggNgn8Fhr5Q", + "decision": "Reject", + "ratings": [ + 6, + 5, + 4, + 3 + ], + "rating": "4.25", + "confidences": [ + 2, + 3, + 3, + 4 + ], + "abstract": "Neural Processes are a powerful tool for learning representations of function spaces purely from examples, in a way that allows them to perform predictions at test time conditioned on so-called context observations. The learned representations are finite-dimensional, while function spaces are infinite-dimensional, and so far it has been unclear how these representations are learned and what kinds of functions can be represented. We show that deterministic Neural Processes implicitly perform a decomposition of the training signals into different frequency components, similar to a Fourier transform. In this context, we derive a theoretical upper bound on the maximum frequency Neural Processes can reproduce, depending on their representation size. This bound is confirmed empirically. 
Finally, we show that Neural Processes can be trained to only represent a subset of possible frequencies and suppress others, which makes them programmable band-pass or band-stop filters.", + "title": "Frequency Decomposition in Neural Processes", + "authors": [ + "Jens Petersen", + "Paul F Jaeger", + "Gregor Koehler", + "David Zimmerer", + "Fabian Isensee", + "Klaus Maier-Hein" + ], + "emails": [ + "~Paul_F_Jaeger1", + "~Klaus_Maier-Hein1", + "~David_Zimmerer1", + "~Gregor_Koehler1", + "~Jens_Petersen2", + "~Fabian_Isensee1" + ], + "rank": 2489 + }, + { + "url": "https://openreview.net/forum?id=aRTRjVPkm-", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 4 + ], + "rating": "4.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "This paper shows how to construct knowledge graphs (KGs) from pre-trained language models (e.g., BERT, GPT-2/3), without human supervision. Popular KGs (e.g, Wikidata, NELL) are built in either a supervised or semi-supervised manner, requiring humans to create knowledge. Recent deep language models automatically acquire knowledge from large-scale corpora via pre-training. The stored knowledge has enabled the language models to improve downstream NLP tasks, e.g., answering questions, and writing code and articles. In this paper, we propose an unsupervised method to cast the knowledge contained within language models into KGs. We show that KGs are constructed with a single forward pass of the pre-trained language models (without fine-tuning) over the corpora. We demonstrate the quality of the constructed KGs by comparing to two KGs (Wikidata, TAC KBP) created by humans. Our KGs also provide open factual knowledge that is new in the existing KGs. 
Our code and KGs will be made publicly available.", + "title": "Language Models are Open Knowledge Graphs", + "authors": [ + "Chenguang Wang", + "Xiao Liu", + "Dawn Song" + ], + "emails": [ + "~Dawn_Song2", + "mails.tsinghua.edu.cn", + "~Chenguang_Wang1" + ], + "rank": 2490 + }, + { + "url": "https://openreview.net/forum?id=XS_E9MiQeVk", + "ratings": [ + 5, + 4, + 4, + 4 + ], + "rating": "4.25", + "confidences": [ + 5, + 5, + 5, + 5 + ], + "abstract": "Unsupervised Domain Adaption has great value in both machine learning theory and applications.\nThe core issue is how to minimize the domain shift. Motivated by the more and more sophisticated distribution alignment methods in sample level, we introduce a novel concept named (virtual) mirror, which represents the counterpart sample in the other domains.The newly-introduced mirror loss using the virtual mirrors establishes the connection cross domains and pushes the virtual mirror pairs together in the aligned representation space. Our proposed method does not align the samples cross domains coarsely or arbitrarily, thus does not distort the internal distribution of the underline distribution and brings better asymptotic performances.\nExperiments on several benchmarks validate the superior performance of our methods.", + "title": "Mirror Sample Based Distribution Alignment for Unsupervised Domain Adaption", + "authors": [ + "Yin Zhao", + "Minquan Wang", + "Longjun Cai" + ], + "emails": [ + "~Longjun_Cai1", + "~Yin_Zhao1", + "~Minquan_Wang1" + ], + "rank": 2491 + }, + { + "url": "https://openreview.net/forum?id=Kc6XtnDIZdI", + "ratings": [ + 5, + 3, + 5, + 4 + ], + "rating": "4.25", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Semi-Supervised Few-shot Learning (SS-FSL) investigates the benefit of incorporating unlabelled data in few-shot settings. 
Recent work has relied on the popular Semi-Supervised Learning (SSL) concept of iterative pseudo-labelling, yet often yield models that are susceptible to error propagation and are sensitive to initialisation. Alternative work utilises the concept of consistency regularisation (CR), a popular SSL state of the art technique where a student model is trained to consistently agree with teacher predictions under different input perturbations, without pseudo-label requirements. However, applications of CR to the SS-FSL set-up struggle to outperform pseudo-labelling approaches; limited available training data yields unreliable early stage predictions and requires fast convergence that is not amenable for, typically slower to converge, CR approaches. In this paper, we introduce a prototype-based approach for SS-FSL that exploits model consistency in a robust manner. Our Dynamic Prototype Refinement (DPR) approach is a novel training paradigm for few-shot model adaptation to new unseen classes, combining concepts from metric and meta-gradient based FSL methods. New class prototypes are alternatively refined 1) explicitly, using labelled and unlabelled data with high confidence class predictions and 2) implicitly, by model fine-tuning using a data selective CR loss. DPR affords CR convergence, with the explicit refinement providing an increasingly stronger initialisation. We demonstrate method efficacy and report extensive experiments on two competitive benchmarks; miniImageNet and tieredImageNet. 
The ability to effectively utilise and combine information from both labelled base-class and auxiliary unlabelled novel-class data results in significant accuracy improvements.", + "title": "Fewmatch: Dynamic Prototype Refinement for Semi-Supervised Few-Shot Learning", + "authors": [ + "Xu Lan", + "Steven McDonagh", + "Shaogang Gong", + "Jiali Wang", + "Zhenguo Li", + "Sarah Parisot" + ], + "emails": [ + "~Steven_McDonagh1", + "~Sarah_Parisot1", + "~Shaogang_Gong2", + "qmul.ac.uk", + "~Xu_Lan2", + "~Zhenguo_Li1" + ], + "rank": 2492 + }, + { + "url": "https://openreview.net/forum?id=H6ZWlQrPGS2", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4, + 4 + ], + "rating": "4.24", + "confidences": [ + 3, + 4, + 5, + 5 + ], + "abstract": "Binarized neural networks, networks with weights and activations constrained to lie in a 2-element set, allow for more time- and resource-efficient inference than standard floating-point networks. However, binarized neural networks typically take more training to plateau in accuracy than their floating-point counterparts, in terms of both iteration count and wall clock time. We demonstrate a technique, partial pre-training, that allows for faster from-scratch training of binarized neural networks by first training the network as a standard floating-point network for a short amount of time, then converting the network to a binarized neural network and continuing to train from there. 
Without tuning any hyperparameters across four networks on three different datasets, partial pre-training is able to train binarized neural networks between 1.26\u00d7 and 1.61\u00d7 faster than when training a binarized network from scratch using standard low-precision training.\n", + "title": "Fast Binarized Neural Network Training with Partial Pre-training", + "authors": [ + "Alex Renda", + "Joshua Wolff Fromm" + ], + "emails": [ + "~Joshua_Wolff_Fromm1", + "~Alex_Renda2" + ], + "rank": 2493 + }, + { + "url": "https://openreview.net/forum?id=T0tmb7uhRhD", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 4 + ], + "rating": "4.24", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "Federated learning enables multiple parties to collaboratively learn a model without exchanging their local data. Currently, federated averaging (FedAvg) is the most widely used federated learning algorithm. However, FedAvg or its variants have obvious shortcomings. It can only be used to learn differentiable models and needs many communication rounds to converge. In this paper, we propose a novel federated learning algorithm FedKT that needs only a single communication round (i.e., round-optimal). With applying the knowledge transfer approach, our algorithm can be applied to any classification model. Moreover, we develop the differentially private versions of FedKT and theoretically analyze the privacy loss. The experiments show that our method can achieve close or better accuracy compared with the other state-of-the-art federated learning algorithms. 
", + "title": "Model-Agnostic Round-Optimal Federated Learning via Knowledge Transfer", + "authors": [ + "Qinbin Li", + "Bingsheng He", + "Dawn Song" + ], + "emails": [ + "~Qinbin_Li1", + "~Bingsheng_He1", + "~Dawn_Song1" + ], + "rank": 2494 + }, + { + "url": "https://openreview.net/forum?id=Ip195saXqIX", + "decision": "Reject", + "ratings": [ + 4, + 5, + 5, + 3 + ], + "rating": "4.24", + "confidences": [ + 5, + 3, + 5, + 4 + ], + "abstract": "Knowledge Distillation refers to a class of methods that transfers the knowledge from a teacher network to a student network. In this paper, we propose Sparse Representation Matching (SRM), a method to transfer intermediate knowledge obtained from one Convolutional Neural Network (CNN) to another by utilizing sparse representation learning. SRM first extracts sparse representations of the hidden features of the teacher CNN, which are then used to generate both pixel-level and image-level labels for training intermediate feature maps of the student network. We formulate SRM as a neural processing block, which can be efficiently optimized using stochastic gradient descent and integrated into any CNN in a plug-and-play manner. Our experiments demonstrate that SRM is robust to architectural differences between the teacher and student networks, and outperforms other KD techniques across several datasets. 
", + "title": "Knowledge Distillation By Sparse Representation Matching", + "authors": [ + "Dat Thanh Tran", + "Moncef Gabbouj", + "Alexandros Iosifidis" + ], + "emails": [ + "~Dat_Thanh_Tran1", + "~Moncef_Gabbouj1", + "~Alexandros_Iosifidis2" + ], + "rank": 2495 + }, + { + "url": "https://openreview.net/forum?id=bXbt3Bvcyud", + "ratings": [ + 4, + 5, + 4, + 4 + ], + "rating": "4.24", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Withdraw", + "title": "Withdraw", + "authors": [ + "Zihao WANG", + "Xu Zhao", + "Tam Le", + "Hao Wu", + "Yong Zhang", + "Makoto Yamada" + ], + "emails": [ + "mails.tsinghua.edu.cn", + "tsinghua.edu.cn", + "~Makoto_Yamada3", + "~Tam_Le2", + "~Zihao_WANG6" + ], + "rank": 2496 + }, + { + "url": "https://openreview.net/forum?id=Xu6K3lGBPAS", + "ratings": [ + 4, + 5, + 3, + 5 + ], + "rating": "4.24", + "confidences": [ + 5, + 5, + 4, + 3 + ], + "abstract": "Recently, differential privacy has been widely studied in machine learning due to its formal privacy guarantees for data analysis. As one of the most important parameters of differential privacy, \u03f5 controls the crucial tradeoff between the strength of the privacy guarantee and the utility of model. Therefore, the choice of \u03f5 has a great influence on the performance of differentially private learning models. But so far, there is still no rigorous method for choosing \u03f5. In this paper, we deduce the influence of \u03f5 on utility private learning models through strict mathematical derivation, and propose a novel approximate approach for estimating the utility of any \u03f5 value. We show that our approximate approach has a fairly small error and can be used to estimate the optimal \u03f5 according to the expected utility of users. 
Experimental results demonstrate high estimation accuracy and broad applicability of our approximate approach.", + "title": "Fast Estimation for Privacy and Utility in Differentially Private Machine Learning", + "authors": [ + "Yuzhe Li", + "Yong Liu", + "Weipinng Wang", + "Bo Li", + "Nan Liu" + ], + "emails": [ + "~Weipinng_Wang1", + "iie.ac.cn", + "~Yuzhe_Li1", + "~Yong_Liu7", + "~Bo_Li22" + ], + "rank": 2497 + }, + { + "url": "https://openreview.net/forum?id=UOOmHiXetC", + "decision": "Reject", + "ratings": [ + 3, + 4, + 6, + 3, + 6 + ], + "rating": "4.24", + "confidences": [ + 4, + 3, + 2, + 4, + 4 + ], + "abstract": "Planning in large state spaces inevitably needs to balance depth and breadth of the search. It has a crucial impact on planners performance and most manage this interplay implicitly. We present a novel method Shoot Tree Search (STS), which makes it possible to control this trade-off more explicitly. Our algorithm can be understood as an interpolation between two celebrated search mechanisms: MCTS and random shooting. It also lets the user control the bias-variance trade-off, akin to TD(n), but in the tree search context.\n\nIn experiments on challenging domains, we show that STS can get the best of both worlds consistently achieving higher scores. 
", + "title": "Structure and randomness in planning and reinforcement learning", + "authors": [ + "Piotr Kozakowski", + "Piotr Januszewski", + "Konrad Czechowski", + "\u0141ukasz Kuci\u0144ski", + "Piotr Mi\u0142o\u015b" + ], + "emails": [ + "~Piotr_Kozakowski1", + "~Piotr_Januszewski1", + "~\u0141ukasz_Kuci\u0144ski1", + "~Piotr_Mi\u0142o\u015b1", + "~Konrad_Czechowski1" + ], + "rank": 2498 + }, + { + "url": "https://openreview.net/forum?id=AGQGZkLBKK", + "ratings": [ + 5, + 5, + 3, + 4 + ], + "rating": "4.24", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "We study continual learning in the large scale setting where tasks in the input sequence are not limited to classification, and the outputs can be of high dimension. Among multiple state-of-the-art methods, we found vanilla experience replay (ER) still very competitive in terms of both performance and scalability, despite its simplicity.\nHowever, a degraded performance is observed for ER with small memory. A further visualization of the feature space reveals that the intermediate representation undergoes a distributional drift.\nWhile existing methods usually replay only the input-output pairs, we hypothesize that their regularization effect is inadequate for complex deep models and diverse tasks with small replay buffer size. Following this observation, we propose to replay the activation of the intermediate layers in addition to the input-output pairs. Considering that saving raw activation maps can dramatically increase memory and compute cost, we propose the Compressed Activation Replay technique, where compressed representations of layer activation are saved to the replay buffer. We show that this approach can achieve superior regularization effect while adding negligible memory overhead to replay method. 
Experiments on both the large-scale Taskonomy benchmark with a diverse set of tasks and standard common datasets (Split-CIFAR and Split-miniImageNet) demonstrate the effectiveness of the proposed method.", + "title": "The Effectiveness of Memory Replay in Large Scale Continual Learning", + "authors": [ + "Yogesh Balaji", + "Mehrdad Farajtabar", + "Dong Yin", + "Alex Mott", + "Ang Li" + ], + "emails": [ + "~Ang_Li1", + "~Alex_Mott1", + "~Yogesh_Balaji1", + "~Dong_Yin1", + "~Mehrdad_Farajtabar1" + ], + "rank": 2499 + }, + { + "url": "https://openreview.net/forum?id=p7OewL0RRIH", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5, + 4 + ], + "rating": "4.23", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "While federated learning allows efficient model training with local data at edge devices, two major issues that need to be resolved are: slow devices known as stragglers and malicious attacks launched by adversaries. While the presence of both stragglers and adversaries raises serious concerns for the deployment of practical federated learning systems, no known schemes or known combinations of schemes, to our best knowledge, effectively address these two issues at the same time. In this work, we propose Sself, a semi-synchronous entropy and loss based filtering/averaging, to tackle both stragglers and adversaries simultaneously. The stragglers are handled by exploiting different staleness (arrival delay) information when combining locally updated models during periodic global aggregation. Various adversarial attacks are tackled by utilizing a small amount of public data collected at the server in each aggregation step, to first filter out the model-poisoned devices using computed entropies, and then perform weighted averaging based on the estimated losses to combat data poisoning and backdoor attacks. A theoretical convergence bound is established to provide insights on the convergence of Sself. 
Extensive experimental results show that Sself outperforms various combinations of existing methods aiming to handle stragglers/adversaries.", + "title": "Sself: Robust Federated Learning against Stragglers and Adversaries", + "authors": [ + "Jungwuk Park", + "Dong-Jun Han", + "Minseok Choi", + "Jaekyun Moon" + ], + "emails": [ + "jejunu.ac.kr", + "~Jaekyun_Moon2", + "~Dong-Jun_Han1", + "kaist.ac.kr" + ], + "rank": 2500 + }, + { + "url": "https://openreview.net/forum?id=8KhxoxKP3iL", + "decision": "Reject", + "ratings": [ + 5, + 4, + 3, + 5 + ], + "rating": "4.23", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "Attention mechanisms have advanced state-of-the-art deep learning models in many machine learning tasks. Despite significant empirical gains, there is a lack of theoretical analyses on their effectiveness. In this paper, we address this problem by studying the sample complexity and loss landscape of attention-based neural networks. Our results show that, under mild assumptions, every local minimum of the attention model has low prediction error, and attention models require lower sample complexity than models without attention. Besides revealing why popular self-attention works, our theoretical results also provide guidelines for designing future attention models. 
Experiments on various datasets validate our theoretical findings.", + "title": "Analyzing Attention Mechanisms through Lens of Sample Complexity and Loss Landscape", + "authors": [ + "Bingyuan Liu", + "Yogesh Balaji", + "Lingzhou Xue", + "Martin Renqiang Min" + ], + "emails": [ + "~Lingzhou_Xue1", + "~Bingyuan_Liu2", + "~Yogesh_Balaji1", + "~Martin_Renqiang_Min1" + ], + "rank": 2501 + }, + { + "url": "https://openreview.net/forum?id=WtlM9p1bVAw", + "decision": "Reject", + "ratings": [ + 6, + 4, + 3, + 3 + ], + "rating": "4.23", + "confidences": [ + 4, + 4, + 2, + 3 + ], + "abstract": "While many works on Continual Learning have shown promising results for mitigating catastrophic forgetting, they have relied on supervised training. To successfully learn in a label-agnostic incremental setting, a model must distinguish between learned and novel classes to properly include samples for training. We introduce a novelty detection method that leverages network confusion caused by training incoming data as a new class. We found that incorporating a class-imbalance during this detection method substantially enhances performance. The effectiveness of our approach is demonstrated across a set of common image classification benchmarks: MNIST, SVHN, CIFAR-10, and CIFAR-100.", + "title": "Unsupervised Class-Incremental Learning through Confusion", + "authors": [ + "Shivam Khare", + "Kun Cao", + "James Matthew Rehg" + ], + "emails": [ + "~Shivam_Khare1", + "~James_Matthew_Rehg1", + "~Kun_Cao2" + ], + "rank": 2502 + }, + { + "url": "https://openreview.net/forum?id=5IxMM3wSLDm", + "ratings": [ + 6, + 3, + 4 + ], + "rating": "4.23", + "confidences": [ + 4, + 5, + 4 + ], + "abstract": "The current dominant paradigm for novelty detection relies on a learned model\u2019s capability to recover the regularities. To this end, reconstruction-based learning is often used in which the normality of an observation is expressed in how well it can be reconstructed. 
However, this can be limiting as anomalous data can be reconstructed well if enough common features are shared between normal and anomalous data. In this paper, we pursue an alternative approach wherein the normality is measured by a contrastive learning objective. Specifically, we propose Rotated Contrastive Predictive Coding (Rotated CPC) where the model operates on rotated images and simultaneously learns to predict the future in latent space. Normality score is thus measured as how predictive the representations are and the score\u2019s robustness is further improved by ensembling predictions on multiple rotations of the input signal. We demonstrate the efficacy of this formulation across a variety of benchmark datasets where our method outperforms state-of-the-art methods.", + "title": "Novelty Detection with Rotated Contrastive Predictive Coding", + "authors": [ + "Dong Huk Park", + "Trevor Darrell" + ], + "emails": [ + "~Trevor_Darrell2", + "~Dong_Huk_Park2" + ], + "rank": 2503 + }, + { + "url": "https://openreview.net/forum?id=t2C42s67gsQ", + "ratings": [ + 5, + 3, + 5 + ], + "rating": "4.23", + "confidences": [ + 4, + 5, + 4 + ], + "abstract": "Mini-batch SGD is a predominant optimization method in deep learning. Several works aim to improve na\u00efve random dataset sampling, which appears typically in deep learning literature, with additional prior to allow faster and better performing optimization. This includes, but not limited to, importance sampling and curriculum learning. In this work, we propose an alternative way: we think of sampling as a trainable agent and let this external model learn to sample mini-batches of training set items based on the current status and recent history of the learned model. The resulting adaptive dataset sampler, named RLSampler, is a policy network implemented with simple recurrent neural networks trained by a policy gradient algorithm. 
We demonstrate RLSampler on image classification benchmarks with several different learner architectures and show consistent performance gain over the originally reported scores. Moreover, either a pre-sampled sequence of indices or a pre-trained RLSampler turns out to be more effective than na\u00efve random sampling regardless of the network initialization and model architectures. Our analysis reveals the possible existence of a model-agnostic sample sequence that best represents the dataset under mini-batch SGD optimization framework.", + "title": "Adaptive Dataset Sampling by Deep Policy Gradient", + "authors": [ + "Jaerin Lee", + "Kyoung Mu Lee" + ], + "emails": [ + "~Jaerin_Lee1", + "~Kyoung_Mu_Lee2" + ], + "rank": 2504 + }, + { + "url": "https://openreview.net/forum?id=3nSU-sDEOG9", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5, + 4 + ], + "rating": "4.23", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "Appropriate credit assignment for delay rewards is a fundamental challenge in various deep reinforcement learning tasks. To tackle this problem, we introduce a delay reward calibration paradigm inspired from a classification perspective. We hypothesize that when an agent's behavior satisfies an equivalent sufficient condition to be awarded, well-represented state vectors should share similarities. To this end, we define an empirical sufficient distribution, where the state vectors within the distribution will lead agents to environmental reward signals in consequent steps. Therefore, an overfitting classifier is established to handle the distribution and generate calibrated rewards. We examine the correctness of sufficient state extraction by tracking the real-time extraction and building hybrid different reward functions in environments with different levels of awarding latency. The results demonstrate that the classifier could generate timely and accurate calibrated rewards, and the rewards could make the training more efficient. 
Finally, we find that the sufficient states extracted by our model resonate with observations of human cognition.", + "title": "Empirical Sufficiency Featuring Reward Delay Calibration", + "authors": [ + "Yixuan Liu", + "Hu Wang", + "Xiaowei Wang", + "Xiaoyue Sun", + "Liuyue Jiang", + "Minhui Xue" + ], + "emails": [ + "~Hu_Wang1", + "adelaide.edu.au", + "~Yixuan_Liu1", + "student.adelaide.edu.au" + ], + "rank": 2505 + }, + { + "url": "https://openreview.net/forum?id=JRJTVcG0f-N", + "ratings": [ + 5, + 5, + 3, + 4 + ], + "rating": "4.23", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "We approach the issue of robust machine vision by presenting a novel deep-learning architecture, inspired by work in theoretical neuroscience on how the primate brain performs visual 'feature binding'. Feature binding describes how separately represented features are encoded in a relationally meaningful way, such as a small edge composing part of the larger contour of an object, or the ear of a cat forming part of its head representation. We propose that the absence of such representations from current models such as convolutional neural networks might partly explain their vulnerability to small, often humanly-imperceptible changes to images known as adversarial examples. It has been proposed that adversarial examples are a result of 'off-manifold' perturbations of images, as the decision boundary is often unpredictable in these directions. Our novel architecture is designed to capture hierarchical feature binding, providing representations in these otherwise vulnerable directions. Having introduced these representations into convolutional neural networks, we provide empirical evidence of enhanced robustness against a broad range of L0, L2 and L\u221e attacks in both the black-box and white-box setting on MNIST, Fashion-MNIST, and CIFAR-10. 
We further provide evidence, through the controlled manipulation of a key hyperparameter, synthetic data-sets, and ablation analyses, that this robustness is dependent on the introduction of the hierarchical binding representations.", + "title": "Hierarchical Binding in Convolutional Neural Networks Confers Adversarial Robustness", + "authors": [ + "Niels Leadholm", + "Simon Stringer" + ], + "emails": [ + "~Niels_Leadholm1", + "psy.ox.ac.uk" + ], + "rank": 2506 + }, + { + "url": "https://openreview.net/forum?id=BqC9lL-hzY_", + "ratings": [ + 3, + 6, + 3, + 4 + ], + "rating": "4.23", + "confidences": [ + 2, + 4, + 3, + 4 + ], + "abstract": "This paper aims to define, visualize, and analyze the feature complexity that is learned by a DNN. We propose a generic definition for the feature complexity. Given the feature of a certain layer in the DNN, our method disentangles and visualizes feature components of different complexity orders from the feature. The disentanglement of feature components enables us to evaluate the reliability, the effectiveness, and the significance of over-fitting of these feature components. Furthermore, such analysis helps to improve the performance of DNNs. As a generic method, the feature complexity also provides new insights into existing deep-learning techniques, such as network compression and knowledge distillation. 
We will release the code when the paper is accepted.", + "title": "Disentanglement, Visualization and Analysis of Complex Features in DNNs", + "authors": [ + "Jie Ren", + "Mingjie Li", + "Zexu Liu", + "Quanshi Zhang" + ], + "emails": [ + "~Quanshi_Zhang1", + "~Jie_Ren1", + "~Zexu_Liu2", + "~Mingjie_Li3" + ], + "rank": 2507 + }, + { + "url": "https://openreview.net/forum?id=oDPxtrCMboN", + "ratings": [ + 4, + 4, + 5, + 4 + ], + "rating": "4.22", + "confidences": [ + 5, + 5, + 4, + 4 + ], + "abstract": "The success of supervised learning crucially hinges on the assumption that training data matches test data, which rarely holds in practice due to potential distribution shift. In light of this, \nmost existing methods for unsupervised domain adaptation focus on achieving domain-invariant representations and small source domain error. However, recent works have shown that this is not sufficient to guarantee good generalization on target domain and in fact is provably detrimental under label distribution shift. Furthermore, in many real-world applications it is often feasible to obtain a small amount of labeled data from the target domain and use them to facilitate model training with source data. Inspired by the above observations, in this paper we propose the first method that aims to simultaneously learn invariant representations and risks under the setting of semi-supervised domain adaptation (Semi-DA). To start with, we first give a finite sample bound for both classification and regression problems under Semi-DA. The bound suggests a principled way for target generalization by aligning both the marginal and conditional distributions across domains in feature space. Motivated by this, we then introduce our LIRR algorithm for jointly \\textbf{L}earning \\textbf{I}nvariant \\textbf{R}epresentations and \\textbf{R}isks. Finally, we conduct extensive experiments on both classification and regression tasks to demonstrate the effectiveness of LIRR. 
Compared with methods that only learn invariant representations or invariant risks, LIRR achieves significant improvements.", + "title": "Learning Invariant Representations and Risks for Semi-supervised Domain Adaptation", + "authors": [ + "Bo Li", + "Yezhen Wang", + "Shanghang Zhang", + "Dongsheng Li", + "Trevor Darrell", + "Kurt Keutzer", + "Han Zhao" + ], + "emails": [ + "~Han_Zhao1", + "~Dongsheng_Li2", + "~Kurt_Keutzer2", + "~Trevor_Darrell2", + "~Bo_Li23", + "~Shanghang_Zhang4", + "~Yezhen_Wang1" + ], + "rank": 2508 + }, + { + "url": "https://openreview.net/forum?id=oAkujcqxJzW", + "ratings": [ + 4, + 5, + 4, + 4 + ], + "rating": "4.22", + "confidences": [ + 4, + 4, + 5, + 5 + ], + "abstract": "Flexibility design problems are a class of problems that appear in strategic decision-making across industries, where the objective is to design a (e.g., manufacturing) network that affords flexibility and adaptivity. The underlying combinatorial nature and stochastic objectives make flexibility design problems challenging for standard optimization methods. In this paper, we develop a reinforcement learning (RL) framework for flexibility design problems. Specifically, we carefully design mechanisms with noisy exploration and variance reduction to ensure empirical success and show the unique advantage of RL in terms of fast-adaptation. 
Empirical results show that the RL-based method consistently finds better solutions compared to classical heuristics.", + "title": "Reinforcement Learning for Flexibility Design Problems", + "authors": [ + "Yehua Wei", + "Lei Zhang", + "Ruiyi Zhang", + "Shijing Si", + "Hao Zhang", + "Lawrence Carin" + ], + "emails": [ + "fmr.com", + "duke.edu", + "~Yehua_Wei2", + "~Ruiyi_Zhang3", + "~Lawrence_Carin2", + "med.cornell.edu" + ], + "rank": 2509 + }, + { + "url": "https://openreview.net/forum?id=ymQ5aKjnfEh", + "ratings": [ + 6, + 4, + 4, + 4 + ], + "rating": "4.22", + "confidences": [ + 1, + 3, + 2, + 3 + ], + "abstract": "We examine the hypothesis that the concept of symmetry augmentation is fundamentally linked to learning. Our focus in this study is on the augmentation of symmetry embedded in 1-dimensional time series (1D-TS). Motivated by the duality between 1D-TS and networks, we augment the symmetry by converting 1D-TS into three 2-dimensional representations: temporal correlation (GAF), transition dynamics (MTF), and recurrent events (RP). This conversion does not require a priori knowledge of the types of symmetries hidden in the 1D-TS. We then exploit the equivariance property of CNNs to learn the hidden symmetries in the augmented 2-dimensional data. We show that such conversion only increases the amount of symmetry, which may lead to more efficient learning. Specifically, we prove that a direct sum based augmentation will never decrease the amount of symmetry. We also attempt to measure the amount of symmetry in the original 1D-TS and augmented representations using the notion of persistent homology, which reveals symmetry increase after augmentation quantitatively. We present empirical studies to confirm our findings using two cases: reinforcement learning for financial portfolio management and classification with the CBF data set. 
Both cases demonstrate significant improvements in learning efficiency.", + "title": "Symmetry-Augmented Representation for Time Series", + "authors": [ + "Amine Mohamed Aboussalah", + "Chi-Guhn Lee" + ], + "emails": [ + "~Chi-Guhn_Lee1", + "~Amine_Mohamed_Aboussalah1" + ], + "rank": 2510 + }, + { + "url": "https://openreview.net/forum?id=O9NAKC_MqMx", + "ratings": [ + 4, + 5, + 4, + 4 + ], + "rating": "4.22", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "Neural network pruning reduces the computational cost of an over-parameterized network to improve its efficiency. Popular methods vary from \u21131-norm sparsification to Neural Architecture Search (NAS). In this work, we propose a novel pruning method that optimizes the final accuracy of the pruned network and distills knowledge from the over-parameterized parent network's inner layers. To enable this approach, we formulate the network pruning as a Knapsack Problem which optimizes the trade-off between the importance of neurons and their associated computational cost. Then we prune the network channels while maintaining the high-level structure of the network. The pruned network is fine-tuned under the supervision of the parent network using its inner network knowledge, a technique we refer to as the {\\it Inner Knowledge Distillation}. Our method leads to state-of-the-art pruning results on ImageNet, CIFAR-10 and CIFAR-100 using ResNet backbones. 
\nTo prune complex network structures such as convolutions with skip-links and depth-wise convolutions, we propose a block grouping approach to cope with these structures.\nThrough this we produce compact architectures with the same FLOPs as EfficientNet-B0 and MobileNetV3 but with higher accuracy, by 1% and 0.3% respectively on ImageNet, and faster runtime on GPU.", + "title": "Knapsack Pruning with Inner Distillation", + "authors": [ + "Yonathan Aflalo", + "Itamar Friedman", + "Asaf Noy", + "Lihi Zelnik-Manor", + "Ming Lin" + ], + "emails": [ + "~Ming_Lin4", + "~Yonathan_Aflalo2", + "alibaba-inc.com", + "~Lihi_Zelnik-Manor1" + ], + "rank": 2511 + }, + { + "url": "https://openreview.net/forum?id=pD9x3TmLONE", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5, + 4 + ], + "rating": "4.22", + "confidences": [ + 5, + 4, + 4, + 5 + ], + "abstract": "Transferring knowledge from large source datasets is an effective way to fine-tune the deep neural networks of the target task with a small sample size. A great number of algorithms have been proposed to facilitate deep transfer learning, and these techniques could be generally categorized into two groups \u2013 Regularized Learning of the target task using models that have been pre-trained from source datasets, and Multitask Learning with both source and target datasets to train a shared backbone neural network. In this work, we aim to improve the multitask paradigm for deep transfer learning via Cross-domain Mixup (XMixup). While the existing multitask learning algorithms need to run backpropagation over both the source and target datasets and usually consume a higher gradient complexity, XMixup transfers the knowledge from source to target tasks more efficiently: for every class of the target task, XMixup selects the auxiliary samples from the source dataset and augments training samples via the simple mixup strategy. We evaluate XMixup over six real world transfer learning datasets. 
Experiment results show that XMixup improves the accuracy by 1.9% on average. Compared with other state-of-the-art transfer learning approaches, XMixup costs much less training time while still obtains higher accuracy.", + "title": "XMixup: Efficient Transfer Learning with Auxiliary Samples by Cross-Domain Mixup", + "authors": [ + "Xingjian Li", + "Haoyi Xiong", + "Haozhe An", + "Cheng-zhong Xu", + "Dejing Dou" + ], + "emails": [ + "~Xingjian_Li1", + "~Haozhe_An1", + "~Haoyi_Xiong1", + "~Cheng-zhong_Xu1", + "~Dejing_Dou1" + ], + "rank": 2512 + }, + { + "url": "https://openreview.net/forum?id=yRP4_BOxdu", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 5 + ], + "rating": "4.21", + "confidences": [ + 5, + 3, + 3, + 3 + ], + "abstract": "We consider hierarchical Bayesian (type-II maximum likelihood) models for observations with latent variables for source and noise, where both hyperparameters need to be estimated jointly from data. This problem has application in many domains in imaging including biomagnetic inverse problems. Crucial factors influencing accuracy of source estimation are not only the noise level but also its correlation structure, but existing approaches have not addressed estimation of noise covariance matrices with full structure. Here, we consider the reconstruction of brain activity from electroencephalography (EEG). This inverse problem can be formulated as a linear regression with independent Gaussian scale mixture priors for both the source and noise components. As a departure from classical sparse Bayesan learning (SBL) models where across-sensor observations are assumed to be independent and identically distributed, we consider Gaussian noise with full covariance structure. Using Riemannian geometry, we derive an efficient algorithm for updating both source and noise covariance along the manifold of positive definite matrices. 
Using the majorization-maximization framework, we demonstrate that our algorithm has guaranteed and fast convergence. We validate the algorithm both in simulations and with real data. Our results demonstrate that the novel framework significantly improves upon state-of-the-art techniques in the real-world scenario where the noise is indeed non-diagonal and fully-structured.", + "title": "Joint Learning of Full-structure Noise in Hierarchical Bayesian Regression Models", + "authors": [ + "Ali Hashemi", + "Chang Cai", + "Klaus Robert Muller", + "Srikantan Nagarajan", + "Stefan Haufe" + ], + "emails": [ + "~Klaus_Robert_Muller1", + "charite.de", + "~Srikantan_Nagarajan2", + "gmail.com", + "~Ali_Hashemi1" + ], + "rank": 2513 + }, + { + "url": "https://openreview.net/forum?id=UMSKPHFlnjD", + "ratings": [ + 4, + 5, + 5, + 3 + ], + "rating": "4.21", + "confidences": [ + 5, + 5, + 4, + 5 + ], + "abstract": "RoIPool/RoIAlign is an indispensable process for the typical two-stage object detection algorithm, it is used to rescale the object proposal cropped from the feature pyramid to generate a fixed size feature map. However, these cropped feature maps of local receptive fields will heavily lose global context information. To tackle this problem, in this paper, we propose a novel end-to-end trainable framework, called Dense Global Context Aware (DGCA) RCNN, aiming at assisting the neural network in strengthening the spatial correlation between the background and the foreground by fusing global context information. The core component of our DGCA framework is a context aware mechanism, in which both global feature pyramid and attention strategies are used for feature extraction and feature refinement, respectively. Specifically, we leverage the dense connection to fuse the global context information of different stages in the top-down process of FPN, and further leverage the attention mechanism to perform global context aware. 
Thus, the implicit relationship between object proposal and global features can be captured by neural networks to improve detection performance. Experimental results on COCO benchmark dataset demonstrate the significant advantages of our approach.\n", + "title": "Dense Global Context Aware RCNN for Object Detection", + "authors": [ + "Wenchao Zhang", + "Haoyu Xie", + "Mai Zhu", + "Chong Fu" + ], + "emails": [ + "~Wenchao_Zhang1", + "~Mai_Zhu1", + "~Haoyu_Xie2", + "~Chong_Fu1" + ], + "rank": 2514 + }, + { + "url": "https://openreview.net/forum?id=3u3ny6UYmjy", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 4 + ], + "rating": "4.21", + "confidences": [ + 4, + 5, + 5, + 5 + ], + "abstract": "Retrosynthesis, of which the goal is to find a set of reactants for synthesizing a target product, is an emerging research area of deep learning. While the existing approaches have shown promising results, they currently lack the ability to consider availability (e.g., stability or purchasability) of the reactants or generalize to unseen reaction templates (i.e., chemical reaction rules).\nIn this paper, we propose a new approach that mitigates the issues by reformulating retrosynthesis into a selection problem of reactants from a candidate set of commercially available molecules. To this end, we design an efficient reactant selection framework, named RetCL (retrosynthesis via contrastive learning), for enumerating all of the candidate molecules based on selection scores computed by graph neural networks. For learning the score functions, we also propose a novel contrastive training scheme with hard negative mining. Extensive experiments demonstrate the benefits of the proposed selection-based approach. For example, when all 671k reactants in the USPTO database are given as candidates, our RetCL achieves top-1 exact match accuracy of 71.3% for the USPTO-50k benchmark, while a recent transformer-based approach achieves 59.6%. 
We also demonstrate that RetCL generalizes well to unseen templates in various settings in contrast to template-based approaches. The code will be released.", + "title": "RetCL: A Selection-based Approach for Retrosynthesis via Contrastive Learning", + "authors": [ + "Hankook Lee", + "Sungsoo Ahn", + "Seung-Woo Seo", + "You Young Song", + "Eunho Yang", + "Sung Ju Hwang", + "Jinwoo Shin" + ], + "emails": [ + "~Seung-Woo_Seo3", + "~Eunho_Yang1", + "~Hankook_Lee1", + "~Jinwoo_Shin1", + "~You_Young_Song1", + "~Sungsoo_Ahn1", + "~Sung_Ju_Hwang1" + ], + "rank": 2515 + }, + { + "url": "https://openreview.net/forum?id=qkJevgi2Fp3", + "ratings": [ + 4, + 5, + 4, + 4 + ], + "rating": "4.21", + "confidences": [ + 3, + 3, + 3, + 5 + ], + "abstract": "Recently, a new recurrent neural network (RNN) named the Legendre Memory Unit (LMU) was proposed and shown to achieve state-of-the-art performance on psMNIST and other datasets. Here we consider a modified version of the LMU, named ff-LMU, the core of which is a linear time-invariant (LTI) system. We first show that the ff-LMU can be trained in a purely feedforward manner and yet executed during inference in a recurrent fashion. Specifically we demonstrate that it trains about 80x faster than LSTM models of the same size. As a result, it overcomes the well-known limitations of training RNNs on GPUs that make them less scalable than feedforward networks like transformers. Second, to validate its utility, we compare ff-LMU performance against LSTMs on five benchmarks picked from the following categories: sentiment classification, semantic similarity, natural language inference, and image classification. Our models, despite their simplicity, achieve new state-of-the-art results for RNNs on psMNIST and QQP, and exhibit superior performance on the remaining three datasets while using up to 1000x fewer parameters. In general, ff-LMU models are highly parameter efficient. 
For instance, the first model that beats it on current leaderboards for QQP is a transformer that uses 50,000x more parameters.", + "title": "Feedforward Legendre Memory Unit", + "authors": [ + "Narsimha Reddy Chilkuri", + "Chris Eliasmith" + ], + "emails": [ + "~Chris_Eliasmith1", + "~Narsimha_Reddy_Chilkuri1" + ], + "rank": 2516 + }, + { + "url": "https://openreview.net/forum?id=_PzOsP37P4T", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4, + 4 + ], + "rating": "4.21", + "confidences": [ + 3, + 3, + 3, + 5 + ], + "abstract": "Estimating the gradients of stochastic nodes, which enables the gradient descent optimization on neural network parameters, is one of the crucial research questions in the deep generative modeling community. When it comes to discrete distributions, Gumbel-Softmax trick reparameterizes Bernoulli and categorical random variables by continuous relaxation. However, gradient estimators of discrete distributions other than the Bernoulli and the categorical have not been explored, and the the Gumbel-Softmax trick is not directly applicable to other discrete distributions. This paper proposes a general version of the Gumbel-Softmax estimator with a theoretical basis, and the proposed estimator is able to reparameterize generic discrete distributions, broader than the Bernoulli and the categorical. In detail, we utilize the truncation of discrete random variables and the Gumbel-Softmax trick with a linear transformation for the relaxed reparameterization. The proposed approach enables the relaxed discrete random variable to be reparameterized through a large-scale stochastic computational graph. 
Our experiments consist of (1) synthetic data analyses and applications on VAE, which show the efficacy of our methods; and (2) topic models, which demonstrate the value of the proposed estimation in practice.", + "title": "Generalized Gumbel-Softmax Gradient Estimator for Generic Discrete Random Variables", + "authors": [ + "Weonyoung Joo", + "Dongjun Kim", + "Seungjae Shin", + "Il-chul Moon" + ], + "emails": [ + "~Il-chul_Moon1", + "~Weonyoung_Joo1", + "~Seungjae_Shin1", + "~Dongjun_Kim1" + ], + "rank": 2517 + }, + { + "url": "https://openreview.net/forum?id=2Ey_1FeNtOC", + "decision": "Reject", + "ratings": [ + 4, + 6, + 4, + 3 + ], + "rating": "4.21", + "confidences": [ + 5, + 3, + 3, + 3 + ], + "abstract": "Recurrent neural networks (RNNs) face two well-known challenges: (a) the difficulty of such networks to generalize appropriately as opposed to memorizing, especially from very short input sequences (generalization); and (b) the difficulty for us to understand the knowledge that the network has attained (transparency). We explore the implications to these challenges of employing a general search through neural architectures using a genetic algorithm with Minimum Description Length (MDL) as an objective function. We find that MDL leads the networks to reach adequate levels of generalization from very small corpora, improving over backpropagation-based alternatives. We demonstrate this approach by evolving networks which perform tasks of increasing complexity with absolute correctness. The resulting networks are small, easily interpretable, and unlike classical RNNs, are provably appropriate for sequences of arbitrary length even when trained on very limited corpora. 
One case study is addition, for which our system grows a network with just four cells, reaching 100% accuracy (and at least .999 certainty) for arbitrary large numbers.", + "title": "Minimum Description Length Recurrent Neural Networks", + "authors": [ + "Nur Lan", + "Emmanuel Chemla", + "Roni Katzir" + ], + "emails": [ + "mail.tau.ac.il", + "ens.fr", + "~Roni_Katzir1" + ], + "rank": 2518 + }, + { + "url": "https://openreview.net/forum?id=GCXq4UHH7h4", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5, + 4 + ], + "rating": "4.21", + "confidences": [ + 5, + 5, + 4, + 5 + ], + "abstract": "Designing an on-sensor data dimensionality reduction scheme for efficient signal sensing has always been a challenging task. Compressive sensing is a state-of-the-art sensing technique used for on-sensor data dimensionality reduction. However, the undesired computational complexity involved in the sensing stage of compressive sensing limits its practical application in resource-constrained sensor devices or high-data-rate sensor devices dealing with high-dimensional signals. In this paper, we propose a selective sensing framework that adopts the novel concept of data-driven nonuniform subsampling to reduce the dimensionality of acquired signals while retaining the information of interest in a computation-free fashion. Selective sensing adopts a co-optimization methodology to co-train a selective sensing operator with a subsequent information decoding neural network. We take image as the sensing modality and reconstruction as the information decoding task to demonstrate the 1st proof-of-concept of selective sensing. The experiment results on CIFAR10, Set5 and Set14 datasets show that selective sensing can achieve an average reconstruction accuracy improvement in terms of PSNR/SSIM by 3.73dB/0.07 and 9.43dB/0.16 over compressive sensing and uniform subsampling counterparts across the compression ratios of 4-32x, respectively. 
Source code is available at https://figshare.com/s/519a923fae8f386d7f5b", + "title": "Selective Sensing: A Data-driven Nonuniform Subsampling Approach for Computation-free On-Sensor Data Dimensionality Reduction", + "authors": [ + "Zhikang Zhang", + "Kai Xu", + "Fengbo Ren" + ], + "emails": [ + "asu.edu", + "~Fengbo_Ren1", + "~Zhikang_Zhang1" + ], + "rank": 2519 + }, + { + "url": "https://openreview.net/forum?id=mLtPtH2SIHX", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 5 + ], + "rating": "4.21", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "Mixup and its variants have promoted a surge of interest due to their capability of boosting the accuracy of deep models. For a random sample pair, such approaches generate a set of synthetic samples through interpolating both the inputs and their corresponding one-hot labels. Current methods either interpolate random features from an input pair or learn to mix salient features from the pair. Nevertheless, the former methods can create misleading synthetic samples or remove important features from the given inputs, and the latter strategies incur significant computation cost for selecting descriptive input regions. In this paper, we show that the effort needed for the input mixing can be bypassed. For a given sample pair, averaging the features from the two inputs and then assigning it with a set of soft labels can effectively regularize the training. We empirically show that the proposed approach performs on par with state-of-the-art strategies in terms of predictive accuracy. 
", + "title": "Bypassing the Random Input Mixing in Mixup", + "authors": [ + "Hongyu Guo" + ], + "emails": [ + "~Hongyu_Guo1" + ], + "rank": 2520 + }, + { + "url": "https://openreview.net/forum?id=IqVB8e0DlUd", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 4 + ], + "rating": "4.21", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "The techniques based on the theory of differential privacy (DP) has become a standard building block in the machine learning community. DP training mechanisms offer strong guarantees that an adversary cannot determine with high confidence about the training data based on analyzing the released model, let alone any details of the instances. However, DP may disproportionately affect the underrepresented and relatively complicated classes. That is, the reduction in utility is unequal for each class. This paper proposes a fair differential privacy algorithm (FairDP) to mitigate the disparate impact on model accuracy for each class. We cast the learning procedure as a two-stage optimization problem, which integrates differential privacy with fairness. FairDP establishes a self-adaptive DP mechanism and dynamically adjusts instance influence in each class depending on the theoretical bias-variance bound. 
Our experimental evaluation shows the effectiveness of FairDP in mitigating the disparate impact on model accuracy among the classes on several benchmark datasets and scenarios ranging from text to vision.", + "title": "Fair Differential Privacy Can Mitigate the Disparate Impact on Model Accuracy", + "authors": [ + "Wenyan Liu", + "Xiangfeng Wang", + "Xingjian Lu", + "Junhong Cheng", + "Bo Jin", + "Xiaoling Wang", + "Hongyuan Zha" + ], + "emails": [ + "cs.ecnu.edu.cn", + "~Hongyuan_Zha1", + "~Wenyan_Liu1", + "~Bo_Jin1", + "stu.ecnu.edu.cn", + "~Xiangfeng_Wang1" + ], + "rank": 2521 + }, + { + "url": "https://openreview.net/forum?id=-BA38x6Cf2", + "decision": "Reject", + "ratings": [ + 5, + 5, + 5, + 2 + ], + "rating": "4.20", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Flow-based generative models refer to deep generative models with \ntractable likelihoods, and offer several attractive properties including \nefficient density estimation and sampling. Despite many advantages, \ncurrent formulations (e.g., normalizing flow) often have an expensive memory/runtime footprint, which hinders their use in a number of applications. \nIn this paper, we consider the setting where we have access to an autoencoder, which is\nsuitably effective for the dataset of interest. Under some mild conditions,\nwe show that we can calculate a mapping to a RKHS which subsequently enables deploying \nmature ideas from the kernel methods literature for flow-based generative models. Specifically, we can explicitly map the RKHS distribution (i.e., \napproximate the flow) to match or align with \na template/well-characterized distribution, via kernel transfer operators. This leads to a direct and resource efficient approximation avoiding iterative optimization. We empirically show that this simple idea yields competitive results on popular datasets such as CelebA,\nas well as promising results on a public 3D brain imaging dataset where the sample sizes are much smaller. 
", + "title": "Can Kernel Transfer Operators Help Flow based Generative Models?", + "authors": [ + "Zhichun Huang", + "Rudrasis Chakraborty", + "Xingjian Zhen", + "Vikas Singh" + ], + "emails": [ + "~Zhichun_Huang1", + "~Vikas_Singh1", + "~Rudrasis_Chakraborty1", + "~Xingjian_Zhen1" + ], + "rank": 2522 + }, + { + "url": "https://openreview.net/forum?id=oeHTRAehiFF", + "decision": "Reject", + "ratings": [ + 4, + 5, + 3, + 5 + ], + "rating": "4.20", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Many Question Answering (QA) tasks have been studied in NLP and employed to evaluate the progress of machine intelligence. One kind of QA tasks, such as Machine Reading Comprehension QA, is well solved by end-to-end neural networks; another kind of QA tasks, such as Knowledge Base QA, needs to be translated to a formatted representations and then solved by a well-designed solver. We notice that some real-world QA tasks are more complex, which cannot be solved by end-to-end neural networks or translated to any kind of formal representations. To further stimulate the research of QA and development of QA techniques, in this work, we create a new and complex QA dataset, ChemistryQA, based on real-world chemical calculation questions. To answer chemical questions, machines need to understand questions, apply chemistry and Math knowledge, and do calculation and reasoning. To help researchers ramp up, we build two baselines: the first one is BERT-based sequence to sequence model, and the second one is an extraction system plus a graph search based solver. These two methods achieved 0.164 and 0.169 accuracy on the development set, respectively, which clearly demonstrate that new techniques are needed for complex QA tasks. 
ChemistryQA dataset will be available for public download once the paper is published.", + "title": "ChemistryQA: A Complex Question Answering Dataset from Chemistry", + "authors": [ + "Zhuoyu Wei", + "Wei Ji", + "Xiubo Geng", + "Yining Chen", + "Baihua Chen", + "Tao Qin", + "Daxin Jiang" + ], + "emails": [ + "microsoft.com", + "~Tao_Qin1", + "~Zhuoyu_Wei1" + ], + "rank": 2523 + }, + { + "url": "https://openreview.net/forum?id=pD7kGiAQkNY", + "ratings": [ + 4, + 4, + 5, + 4 + ], + "rating": "4.20", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Weight decay is a widely used technique for training Deep Neural Networks(DNN). It greatly affects generalization performance, but the underlying mechanisms are not fully understood. Recent works show that for layers followed by normalizations, weight decay mainly affects the \\emph{effective learning rate}. However, although normalizations have been extensively adopted in modern DNNs, layers such as the final fully-connected layer do not satisfy this precondition. For these layers, the effects of weight decay are still unclear. In this paper, we comprehensively investigate the mechanisms of weight decay and find that except for influencing effective learning rate, weight decay has another distinct mechanism that is equally important: affecting generalization performance by controlling \\emph{cross-boundary risk}. These two mechanisms together give a more comprehensive explanation for the effects of weight decay. Based on this discovery, we propose a new training method called \\textbf{FixNorm}, which discards weight decay and directly controls the two mechanisms. We also propose a practical method to tune hyperparameters of FixNorm, finding near-optimal solutions 2\u223c3 times faster than Bayesian Optimization. On ImageNet classification task, training EfficientNet-B0 with FixNorm achieves 77.7\\%, which outperforms the original baseline by a clear margin. 
Surprisingly, when scaling MobileNetV2 to the same FLOPS and applying the same tricks with EfficientNet-B0, training with FixNorm achieves 77.4\\%, which shows the importance of well-tuned training procedures and further verifies the effectiveness of our approach. We set up more well-tuned baselines using FixNorm, to facilitate fair comparisons in the community.", + "title": "FixNorm: Dissecting Weight Decay for Training Deep Neural Networks", + "authors": [ + "Yucong Zhou", + "Yunxiao Sun", + "Jian Zhang", + "Zhao Zhong" + ], + "emails": [ + "~Zhao_Zhong1", + "~Jian_Zhang18", + "~Yucong_Zhou2", + "~Yunxiao_Sun1" + ], + "rank": 2524 + }, + { + "url": "https://openreview.net/forum?id=pbkSuhxdnZ", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4, + 4 + ], + "rating": "4.20", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Compression technologies for deep neural networks (DNNs), such as weight quantization, have been widely investigated to reduce the DNN model size so that they can be implemented on hardware with strict resource restrictions. However, one major downside of model compression is accuracy degradation. To deal with this problem effectively, we propose a new compressed network inference scheme, with a high accuracy but slower DNN coupled with its highly compressed DNN version that typically delivers much faster inference speed but with a lower accuracy. During inference, we determine the confidence of the prediction of the compressed DNN, and infer the original DNN for the inputs that are considered not confident by the compressed DNN. The proposed design can deliver overall accuracy close to the high accuracy model, but with the latency closer to the compressed DNN. We demonstrate our design on two image classification tasks: CIFAR-10 and ImageNet. 
Our experiments show that our design can recover up to 94% of accuracy drop caused by extreme network compression, with more than 90% increase in throughput compared to just using the original DNN. This is the first work that considers using a highly compressed DNN along with the original DNN in parallel to improve latency significantly while effectively maintaining the original model accuracy. ", + "title": "TwinDNN: A Tale of Two Deep Neural Networks", + "authors": [ + "Hyunmin Jeong", + "Deming Chen" + ], + "emails": [ + "~Hyunmin_Jeong1", + "~Deming_Chen1" + ], + "rank": 2525 + }, + { + "url": "https://openreview.net/forum?id=BHBb-QVVkNS", + "ratings": [ + 5, + 5, + 3, + 4 + ], + "rating": "4.20", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "In natural language processing, deep learning methods are popular for sequence labelling tasks but training them usually requires large amounts of labelled data. Active learning can reduce the amount of labelled training data required by iteratively acquiring labels for the data points a model is most uncertain about. However, active learning methods usually use supervised training and ignore the data points which have not yet been labelled. We propose an approach to sequence labelling using active learning which incorporates both labelled and unlabelled data. We train a locally-contextual conditional random field with deep nonlinear potentials in a semi-supervised manner, treating the missing labels of the unlabelled sentences as latent variables. Our semi-supervised active learning method is able to leverage the sentences which have not yet been labelled to improve on the performance of purely supervised active learning. We also find that using an additional, larger pool of unlabelled data provides further improvements. 
Across a variety of sequence labelling tasks, our method is consistently able to match 97% of the performance of state of the art models while using less than 30% of the amount of training data.", + "title": "Efficiently labelling sequences using semi-supervised active learning", + "authors": [ + "Harshil Shah", + "David Barber" + ], + "emails": [ + "~David_Barber1", + "~Harshil_Shah1" + ], + "rank": 2526 + }, + { + "url": "https://openreview.net/forum?id=F_txysyDFbw", + "decision": "Reject", + "ratings": [ + 3, + 5, + 5 + ], + "rating": "4.20", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "We study neural-linear bandits for solving problems where both exploration and representation learning play an important role. Neural-linear bandits leverage the representation power of deep neural networks and combine it with efficient exploration mechanisms, designed for linear contextual bandits, on top of the last hidden layer. Since the representation is optimized during learning, information regarding exploration with \u201cold\u201d features is lost. We propose the first limited memory neural- linear bandit that is resilient to this catastrophic forgetting phenomenon by solving a semi-definite program. We then approximate the semi-definite program using stochastic gradient descent to make the algorithm practical and adjusted for online usage. We perform simulations on a variety of data sets, including regression, classification, and sentiment analysis. In addition, we evaluate our algorithm in a challenging uplink rate-control application. The bandit controls the transmission rates of data segments over cellular links to achieve optimal throughput. 
We observe that our algorithm achieves superior performance and shows resilience to catastrophic forgetting.", + "title": "Online Limited Memory Neural-Linear Bandits", + "authors": [ + "Tom Zahavy", + "Ofir Nabati", + "Leor Cohen", + "Shie Mannor" + ], + "emails": [ + "~Tom_Zahavy2", + "~Ofir_Nabati1", + "gmail.com", + "~Shie_Mannor2" + ], + "rank": 2527 + }, + { + "url": "https://openreview.net/forum?id=nXSDybDWV3", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 3 + ], + "rating": "4.20", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Stein Variational Inference is a technique for approximate Bayesian inferencethat is recently gaining popularity since it combines the scalability of traditionalVariational Inference (VI) with the flexibility of non-parametric particle basedinference methods. While there has been considerable progress in developmentof algorithms, integration in existing probabilistic programming languages (PPLs)with an easy-to-use interface is currently lacking. EinStein VI is a lightweightcomposable library that integrates the latest Stein Variational Inference methodswith the NumPyro PPL. Inference with EinStein VI relies on ELBO-within-Stein tosupport use of custom inference programs (guides), non-linear scaling of repulsionforce, second-order gradient updates using matrix-valued kernels and parametertransforms. We demonstrate the achieved synergy of the different Stein techniquesand the versatility of EinStein VI library by applying it on examples. Comparedto traditional Stochastic VI, EinStein VI is better at capturing uncertainty andrepresenting richer posteriors. We use several applications to show how one canuse Neural Transforms (NeuTra) and second-order optimization to provide betterinference using EinStein VI. We show how EinStein VI can be used to infer theparameters of a Latent Dirichlet Allocation model with a neural guide. 
The resultsindicate that Einstein VI can be combined with NumPyro\u2019s support for automaticmarginalization to do inference over models with discrete latent variables. Finally,we introduce an example with a novel extension to Deep Markov Models, calledthe Stein Mixture Deep Markov Model (SM-DMM), which shows that EinStein VIcan be scaled to reasonably large models with over 500.000 parameters", + "title": "Einstein VI: General and Integrated Stein Variational Inference in NumPyro", + "authors": [ + "Ahmad Salim Al-Sibahi", + "Ola R\u00f8nning", + "Christophe Ley", + "Thomas Wim Hamelryck" + ], + "emails": [ + "di.ku.dk", + "~Ahmad_Salim_Al-Sibahi1", + "binf.ku.dk", + "ugent.be" + ], + "rank": 2528 + }, + { + "url": "https://openreview.net/forum?id=_8EQ_gMAHFy", + "ratings": [ + 4, + 4, + 4, + 5 + ], + "rating": "4.20", + "confidences": [ + 3, + 4, + 5, + 3 + ], + "abstract": "In this paper, we present a novel physics-rooted network structure that dramatically facilitates the learning of complex dynamic systems. Our method is inspired by the Vortex Method in fluid dynamics, whose key idea lies in that, given the observed flow field, instead of describing it with a function of space and time, one can equivalently understand the observation as being caused by a number of Lagrangian particles ----- vortices, flowing with the field. Since the number of such vortices are much smaller than that of the Eulerian, grid discretization, this Lagrangian discretization in essence encodes the system dynamics on a compact physics-based latent space. Our method enforces such Lagrangian discretization with a Encoder---Dynamics---Decode network structure, and trains it with a novel three-stage curriculum learning algorithm. 
With data generated from the high precision Eulerian DNS method, our alorithm takes advantage of the simplifying power of the Lagrangian method while persisting the physical integrity.\nThis method fundamentally differs from the current approaches in the field of physics-informed learning, and provides superior results for being more versatile, yielding more physical-correctness with less data sample, and faster to compute at high precision. Beyond providing a viable way of simulating complex fluid at high-precision, our method opens up a brand new horizon for embedding knowledge prior via constructing physically-valid latent spaces, which can be applied to further research areas beyond physical simulation.\n", + "title": "VortexNet: Learning Complex Dynamic Systems with Physics-Embedded Networks", + "authors": [ + "Shiying Xiong", + "Xingzhe He", + "Yunjin Tong", + "Yitong Deng", + "Bo Zhu" + ], + "emails": [ + "dartmouth.edu", + "~Bo_Zhu2", + "~Shiying_Xiong1", + "~Xingzhe_He1" + ], + "rank": 2529 + }, + { + "url": "https://openreview.net/forum?id=HeEzgm-f4g1", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5, + 4 + ], + "rating": "4.20", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Batch size is an important hyper-parameter for training deep learning models with stochastic gradient decent (SGD) method, and it has great influence on the training time and model performance. We study the batch size selection problem for training graph neural network (GNN) with SGD method. \nTo reduce the training time while keeping a decent model performance, we propose a metric that combining both the variance of gradients and compute time for each mini-batch. We theoretically analyze how batch-size influence such a metric and propose the formula to evaluate some rough range of optimal batch size. \nIn GNN, gradients evaluated on samples in a mini-batch are not independent and it is challenging to evaluate the exact variance of gradients. 
To address the dependency, we analyze an estimator for gradients that considers the randomness arising from two consecutive layers in GNN, and suggest a guideline for picking the appropriate scale of the batch size. \nWe complement our theoretical results with extensive empirical experiments for ClusterGCN, FastGCN and GraphSAINT on 4 datasets: Ogbn-products, Ogbn-arxiv, Reddit and Pubmed. We demonstrate that in contrast to conventional deep learning models, GNNs benefit from large batch sizes.", + "title": "On Batch-size Selection for Stochastic Training for Graph Neural Networks", + "authors": [ + "Yaochen Hu", + "Amit Levi", + "Ishaan Kumar", + "Yingxue Zhang", + "Mark Coates" + ], + "emails": [ + "huawei.com", + "~Yaochen_Hu1", + "~Mark_Coates1" + ], + "rank": 2530 + }, + { + "url": "https://openreview.net/forum?id=T58qDGccG56", + "decision": "Reject", + "ratings": [ + 5, + 3, + 5, + 4 + ], + "rating": "4.20", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Multi-branch architectures are widely used in state-of-the-art neural networks. Their empirical success relies on some design wisdom, like adding normalization layers or/and scaling down the initialization. In this paper, we investigate the multi-branch architecture from the stability perspective. Specifically, we establish the forward/backward stability of multi-branch network, which leads to several new findings. Our analysis shows that only scaling down the initialization may not be enough for training multi-branch network successfully because of the uncontrollable backward process. We also unveil a new role of the normalization layer in terms of stabilizing the multi-branch architectures. More importantly, we propose a new design ``STAM aggregation\" that can guarantee to STAbilize the forward/backward process of Multi-branch networks irrespective of the number of branches. 
We demonstrate that with STAM aggregation, the same training strategy is applicable to models with different numbers of branches, which can reduce the hyper-parameter tuning burden. Our experiments verify our theoretical findings and also demonstrate that the STAM aggregation can improve the performance of multi-branch networks considerably.", + "title": "On the Stability of Multi-branch Network", + "authors": [ + "Huishuai Zhang", + "Da Yu", + "Wei Chen", + "Tie-Yan Liu" + ], + "emails": [ + "~Wei_Chen1", + "~Da_Yu1", + "~Huishuai_Zhang3", + "~Tie-Yan_Liu1" + ], + "rank": 2531 + }, + { + "url": "https://openreview.net/forum?id=9wHe4F-lpp", + "decision": "Reject", + "ratings": [ + 3, + 6, + 5, + 4 + ], + "rating": "4.19", + "confidences": [ + 5, + 2, + 4, + 5 + ], + "abstract": "Binary neural networks (BNNs), where both weights and activations are binarized into 1 bit, have been widely studied in recent years due to its great benefit of highly accelerated computation and substantially reduced memory footprint that appeal to the development of resource constrained devices. In contrast to previous methods tending to reduce the quantization error for training BNN structures, we argue that the binarized convolution process owns an increasing linearity towards the target of minimizing such error, which in turn hampers BNN's discriminative ability. In this paper, we re-investigate and tune proper non-linear modules to fix that contradiction, leading to a strong baseline which achieves state-of-the-art performance on the large-scale ImageNet dataset in terms of accuracy and training efficiency. To go further, we find that the proposed BNN model still has much potential to be compressed by making a better use of the efficient binary operations, without losing accuracy. In addition, the limited capacity of the BNN model can also be increased with the help of group execution. 
Based on these insights, we are able to improve the baseline with an additional 4\u223c5% top-1 accuracy gain even with less computational cost. Our code and all trained models will be made public.", + "title": "FTBNN: Rethinking Non-linearity for 1-bit CNNs and Going Beyond", + "authors": [ + "Zhuo Su", + "Linpu Fang", + "Deke Guo", + "Dewen Hu", + "Matti Pietik\u00e4inen", + "Li Liu" + ], + "emails": [ + "~Zhuo_Su2", + "~Matti_Pietik\u00e4inen2", + "~Deke_Guo1", + "~Li_Liu9", + "~Dewen_Hu2", + "~Linpu_Fang1" + ], + "rank": 2532 + }, + { + "url": "https://openreview.net/forum?id=h8q8iZi-ks", + "decision": "Reject", + "ratings": [ + 4, + 4, + 6, + 3 + ], + "rating": "4.19", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "In this work we tackle the problem of out-of-distribution generalization through conditional computation. Real-world applications often exhibit a larger distributional shift between training and test data than most datasets used in research. On the other hand, training data in such applications often comes with additional annotation. We propose a method for leveraging this extra information by using an auxiliary network that modulates activations of the main network. 
We show that this approach improves performance over a strong baseline on the Inria Aerial Image Labeling and the Tumor-Infiltrating Lymphocytes (TIL) Datasets, which by design evaluate out-of-distribution generalization in both semantic segmentation and image classification.", + "title": "Conditional Networks", + "authors": [ + "Anthony Ortiz", + "Kris Sankaran", + "Olac Fuentes", + "Christopher Kiekintveld", + "Pascal Vincent", + "Yoshua Bengio", + "Doina Precup" + ], + "emails": [ + "~Doina_Precup1", + "~Yoshua_Bengio1", + "~Anthony_Ortiz1", + "~Kris_Sankaran1", + "~Olac_Fuentes1", + "~Pascal_Vincent1", + "~Christopher_Kiekintveld2" + ], + "rank": 2533 + }, + { + "url": "https://openreview.net/forum?id=2pYMlvmsNaK", + "ratings": [ + 6, + 3, + 4, + 4 + ], + "rating": "4.19", + "confidences": [ + 4, + 5, + 4, + 3 + ], + "abstract": "First-order stochastic optimization methods are currently the most widely used class of methods for training deep neural networks. However, the choice of the optimizer has become an ad-hoc rule that can significantly affect the performance. For instance, SGD with momentum (SGD+M) is typically used in computer vision (CV) and Adam is used for training transformer models for Natural Language Processing (NLP). Using the wrong method can lead to significant performance degradation. Inspired by the dual averaging algorithm, we propose Modernized Dual Averaging (MDA), an optimizer that is able to perform as well as SGD+M in CV and as Adam in NLP. Our method is not adaptive and is significantly simpler than Adam. 
We show that MDA induces a decaying uncentered L 2 -regularization compared to vanilla SGD+M and hypothesize that this may explain why it works on NLP problems where SGD+M fails.", + "title": "Dual Averaging is Surprisingly Effective for Deep Learning Optimization", + "authors": [ + "Samy Jelassi", + "Aaron Defazio" + ], + "emails": [ + "~Aaron_Defazio1", + "~Samy_Jelassi1" + ], + "rank": 2534 + }, + { + "url": "https://openreview.net/forum?id=UHGbeVORAAf", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5, + 4 + ], + "rating": "4.19", + "confidences": [ + 4, + 5, + 3, + 4 + ], + "abstract": "Deep neural networks (DNNs) compute representations in a layer by layer fashion, producing a final representation at the top layer of the pipeline, and classification or regression is made using the final representation. A number of DNNs (e.g., ResNet, DenseNet) have shown that representations from the earlier layers can be beneficial. They improved performance by aggregating representations from different layers. In this work, we asked the question, besides forming an aggregation, whether these representations can be utilized directly with the classification layer(s) to obtain better performance. We started our quest to the answer by investigating the classifiers based on the representations from different layers and observed that these classifiers were diverse and many of their decisions were complementary to each other, hence having the potential to generate a better overall decision when combined. Following this observation, we propose an ensemble method that creates an ensemble of classifiers, each taking a representation from a different depth of a base DNN as the input. We tested this ensemble method in the setting of few-shot learning. Experiments were conducted on the mini-ImageNet and tieredImageNet datasets which are commonly used in the evaluation of few-shot learning methods. 
Our ensemble achieves the new state-of-the-art results for both datasets, comparing to previous regular and ensemble approaches.", + "title": "Multi-Representation Ensemble in Few-Shot Learning", + "authors": [ + "Qing Chen", + "Jian Zhang" + ], + "emails": [ + "~Jian_Zhang5", + "~Qing_Chen4" + ], + "rank": 2535 + }, + { + "url": "https://openreview.net/forum?id=trPMYEn1FCX", + "decision": "Reject", + "ratings": [ + 5, + 5, + 4, + 3 + ], + "rating": "4.19", + "confidences": [ + 4, + 4, + 3, + 5 + ], + "abstract": "The task of predicting human motion is complicated by the natural heterogeneity and compositionality of actions, necessitating robustness to distributional shifts as far as out-of-distribution (OoD). Here we formulate a new OoD benchmark based on the Human3.6M and CMU motion capture datasets, and introduce a hy- brid framework for hardening discriminative architectures to OoD failure by aug- menting them with a generative model. When applied to current state-of-the-art discriminative models, we show that the proposed approach improves OoD ro- bustness without sacrificing in-distribution performance, and can theoretically facilitate model interpretability. We suggest human motion predictors ought to be constructed with OoD challenges in mind, and provide an extensible general framework for hard- ening diverse discriminative architectures to extreme distributional shift.", + "title": "GENERATIVE MODEL-ENHANCED HUMAN MOTION PREDICTION", + "authors": [ + "Anthony Bourached", + "Ryan-Rhys Griffiths", + "Robert Gray", + "Ashwani Jha", + "Parashkev Nachev" + ], + "emails": [ + "ucl.ac.uk", + "cam.ac.uk" + ], + "rank": 2536 + }, + { + "url": "https://openreview.net/forum?id=y2I4gyAGlCB", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 5 + ], + "rating": "4.19", + "confidences": [ + 5, + 3, + 5, + 3 + ], + "abstract": "In this paper we explore the richness of information captured by the latent space of a vision-based generative model. 
The model combines unsupervised generative learning with a task-based performance predictor to learn and to exploit task-relevant object affordances given visual observations from a reaching task, involving a scenario and a stick-like tool. While the learned embedding of the generative model captures factors of variation in 3D tool geometry (e.g. length, width, and shape), the performance predictor identifies sub-manifolds of the embedding that correlate with task success. Within a variety of scenarios, we demonstrate that traversing the latent space via backpropagation from the performance predictor allows us to imagine tools appropriate for the task at hand. Our results indicate that affordances \u2013 like the utility for reaching \u2013 are encoded along smooth trajectories in latent space. Accessing these emergent affordances by considering only high-level performance criteria (such as task success) enables an agent to manipulate tool geometries in a targeted and deliberate way.", + "title": "Imagine That! Leveraging Emergent Affordances for 3D Tool Synthesis", + "authors": [ + "Yizhe Wu", + "Sudhanshu Kasewa", + "Oliver Groth", + "Sasha Salter", + "Kevin Li Sun", + "Oiwi Parker Jones", + "Ingmar Posner" + ], + "emails": [ + "~Oliver_Groth1", + "~Oiwi_Parker_Jones1", + "~Yizhe_Wu1", + "~Ingmar_Posner1", + "~Sasha_Salter1", + "~Sudhanshu_Kasewa1", + "~Kevin_Li_Sun1" + ], + "rank": 2537 + }, + { + "url": "https://openreview.net/forum?id=ME1ugH3uXr", + "ratings": [ + 4, + 4, + 5, + 4 + ], + "rating": "4.19", + "confidences": [ + 5, + 4, + 3, + 4 + ], + "abstract": "Domain shift in finetuning from pre-training can significantly impact the performance of deep neural networks. In NLP, this led to domain specific models such as SciBERT, BioBERT, ClinicalBERT, and FinBERT; each pre-trained on a different, manually curated, domain-specific corpus. 
In this work, we present a novel domain-adaptation framework to tailor pre-training so as to reap the benefits of domain specific pre-training even if we do not have access to large domain specific pre-training corpus. The need for such a method is clear as it is infeasible to collect a large pre-training corpus for every possible domain. Our method is completely unsupervised and unlike related methods, works well in the setting where the target domain data is limited in size. We draw a connection between the task of adapting a large corpus to a target domain and that of anomaly detection, resulting in a scalable and efficient domain adaptation framework. We evaluate our framework and various baselines on eight tasks across four different domains: Biomedical, Computer Science, News, and Movie reviews. Our framework outperforms all the baseline methods and yields an average gain of 1.07% in performance. We also evaluate it on one of the GLUE task, sentiment analysis and achieve an improvement of 0.4% in accuracy.", + "title": "Domain Adaptation via Anaomaly Detection", + "authors": [ + "Vivek Madan", + "Ashish Khetan", + "Zohar Karnin" + ], + "emails": [ + "~Ashish_Khetan1", + "~Vivek_Madan2", + "~Zohar_Karnin1" + ], + "rank": 2538 + }, + { + "url": "https://openreview.net/forum?id=IZIHJ-ME9c-", + "ratings": [ + 4, + 5, + 4, + 4 + ], + "rating": "4.19", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "While machine learning excels at learning predictive models from observational data, learning the causal mechanisms behind the observed phenomena presents the significant challenge of distinguishing true causal relationships from confounding and other potential sources of spurious correlations. Many existing algorithms for the discovery of causal structure from observational data rely on evaluating the conditional independence relationships among features to account for the effects of confounding. 
However, the choice of independence tests for these algorithms often rely on assumptions regarding the data distributions and type of causal relationships. To avoid these assumptions, we develop a novel deep learning approach, dubbed the Hokey Pokey model, to indirectly explore the conditional dependencies among a set of variables by rapidly comparing predictive errors given different combinations of input variables. We then use the results of this comparison as a predictive signal for causal relationships among the variables. We conduct rigorous experiments to evaluate model robustness and generalizability using generated datasets with known underlying causal relationships and analyze the capacity of model error comparisons to provide a predictive signal for causal structure. Our model outperforms commonly used baseline models (PC and GES) and is capable of discovering causal relationships of different complexity (graph size, density and structure) in both binary and continuous data.", + "title": "Hokey Pokey Causal Discovery: Using Deep Learning Model Errors to Learn Causal Structure", + "authors": [ + "Emily Saldanha", + "Dustin Arendt", + "Svitlana Volkova" + ], + "emails": [ + "~Svitlana_Volkova1", + "~Emily_Saldanha1", + "pnnl.gov" + ], + "rank": 2539 + }, + { + "url": "https://openreview.net/forum?id=1IBgFQbj7y", + "decision": "No judgement (proceed with normal process)", + "ratings": [ + 5, + 4, + 5, + 3 + ], + "rating": "4.19", + "confidences": [ + 3, + 5, + 4, + 4 + ], + "abstract": "Categorical Cross Entropy (CCE) is the most commonly used loss function in deep neural networks such as Convolutional Neural Networks (CNNs) for multi-class classification problems. In spite of the fact that CCE is highly susceptible to noise; CNN models trained without accounting for the unique noise characteristics of the input data, or noise introduced during model training, invariably suffer from overfitting affecting model generalizability. 
The lack of generalizability becomes especially apparent in the context of ethnicity/racial image classification problems encountered in the domain of computer vision. One such problem is the unintended discriminatory racial bias that CNN models trained using CCE fail to adequately address. In other words, CNN models trained using CCE offer a skewed representation of classification performance favoring lighter skin tones.\n\nIn this paper, we propose and empirically validate a novel noise-robust extension to the existing CCE loss function called Maximum Categorical Cross-Entropy (MCCE), which utilizes CCE loss and a novel reconstruction loss, calculated using the Maximum Entropy (ME) measures of the convolutional kernel weights and input training dataset. We compare the use of MCCE with CCE-trained models on two benchmarking datasets, colorFERET and UTKFace, using a Residual Network (ResNet) CNN architecture. MCCE-trained models reduce overfitting by 5.85% and 4.3% on colorFERET and UTKFace datasets respectively. In cross-validation testing, MCCE-trained models outperform CCE-trained models by 8.8% and 25.16% on the colorFERET and UTKFace datasets respectively. 
MCCE addresses and mitigates the persistent problem of inadvertent racial bias for facial recognition problems in the domain of computer vision.", + "title": "Maximum Categorical Cross Entropy (MCCE): A noise-robust alternative loss function to mitigate racial bias in Convolutional Neural Networks (CNNs) by reducing overfitting", + "authors": [ + "Nidhi Gowdra", + "Roopak Sinha", + "Stephen MacDonell", + "WeiQi Yan" + ], + "emails": [ + "~Nidhi_Gowdra1", + "aut.ac.nz" + ], + "rank": 2540 + }, + { + "url": "https://openreview.net/forum?id=kJVVgJ-yCq", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 4 + ], + "rating": "4.19", + "confidences": [ + 3, + 4, + 4, + 5 + ], + "abstract": "Existing joint learning strategies like multi-task or meta-learning focus more on shared learning and have little to no scope for task-specific learning. This creates the need for a distinct shared pretraining phase and a task-specific finetuning phase. The fine-tuning phase creates separate systems for each task, where improving the performance of a particular task necessitates forgetting some of the knowledge garnered in other tasks. Humans, on the other hand, perform task-specific learning in synergy with general domain-based learning. Inspired by these learning patterns in humans, we suggest a simple yet generic task aware framework to incorporate into existing joint learning strategies. The proposed framework computes task-specific representations to modulate model parameters during joint learning. Hence, it performs both shared and task-specific learning in a single-phase resulting in a single model for all the tasks. The single model itself achieves significant performance gains over the existing joint learning strategies. For example, we train a model on Speech Translation (ST), Automatic Speech Recognition (ASR), and Machine Translation (MT) tasks using the proposed task aware joint learning approach. 
This single model achieves a performance of 28.64 BLEU score on ST MuST-C English-German, WER of 11.61 on ASR TEDLium v3, and BLEU score of 23.35 on MT WMT14 English-German tasks. This sets a new state-of-the-art performance (SOTA) on the ST task while outperforming the existing end-to-end ASR systems with a competitive performance on the MT task. ", + "title": "Learning without Forgetting: Task Aware Multitask Learning for Multi-Modality Tasks", + "authors": [ + "Sathish Reddy Indurthi", + "Mohd Abbas Zaidi", + "Nikhil Kumar Lakumarapu", + "Beomseok Lee", + "HyoJung Han", + "Sangha Kim", + "Inchul Hwang" + ], + "emails": [ + "~Mohd_Abbas_Zaidi2", + "~Sathish_Reddy_Indurthi2", + "~Nikhil_Kumar_Lakumarapu1", + "~HyoJung_Han1", + "samsung.com" + ], + "rank": 2541 + }, + { + "url": "https://openreview.net/forum?id=7ehDLD1yoE0", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4, + 4 + ], + "rating": "4.19", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "Adversarial examples are imperceptible perturbations in the input to a neural model that result in misclassification. Generating adversarial examples for source code poses an additional challenge compared to the domains of images and natural language, because source code perturbations must adhere to strict semantic guidelines so the resulting programs retain the functional meaning of the code. We propose a simple and efficient gradient-free method for generating state-of-the-art adversarial examples on models of code that can be applied in a white-box or black-box setting. Our method generates untargeted and targeted attacks, and empirically outperforms competing gradient-based methods with less information and less computational effort.", + "title": "STRATA: Simple, Gradient-free Attacks for Models of Code", + "authors": [ + "Jacob M. 
Springer", + "Bryn Marie Reinstadler", + "Una-May O'Reilly" + ], + "emails": [ + "~Una-May_O'Reilly1", + "~Bryn_Marie_Reinstadler1", + "~Jacob_M._Springer1" + ], + "rank": 2542 + }, + { + "url": "https://openreview.net/forum?id=nhIsVl2UoMt", + "decision": "Reject", + "ratings": [ + 3, + 4, + 6 + ], + "rating": "4.18", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "We present the Additive Poisson Process (APP), a novel framework that can model the higher-order interaction effects of the intensity functions in stochastic processes using lower dimensional projections. Our model combines the techniques in information geometry to model higher-order interactions on a statistical manifold and in generalized additive models to use lower-dimensional projections to overcome the effects from the curse of dimensionality. Our approach solves a convex optimization problem by minimizing the KL divergence from a sample distribution in lower dimensional projections to the distribution modeled by an intensity function in the stochastic process. Our empirical results show that our model is able to use samples observed in the lower dimensional space to estimate the higher-order intensity function with extremely sparse observations.", + "title": "Additive Poisson Process: Learning Intensity of Higher-Order Interaction in Stochastic Processes", + "authors": [ + "Simon Luo", + "Feng Zhou", + "Lamiae Azizi", + "Mahito Sugiyama" + ], + "emails": [ + "~Feng_Zhou9", + "~Simon_Luo1", + "~Mahito_Sugiyama1", + "sydney.edu.au" + ], + "rank": 2543 + }, + { + "url": "https://openreview.net/forum?id=XSVrZvmwVR9", + "ratings": [ + 3, + 4, + 5, + 5 + ], + "rating": "4.18", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Meta-learning enables models to adapt to new environments rapidly with a few training examples. Current gradient-based meta-learning methods concentrate on finding good initialization (meta-weights) for learners, but ignore the impact of neural architectures. 
In this paper, we aim to obtain better meta-learners by co-optimizing the architecture and meta-weights simultaneously. Existing NAS-based methods apply a two-stage strategy,i.e., first searching architectures and then re-training meta-weights for the searched architecture. However, this two-stage strategy would lead to a suboptimal meta-learner, since the meta-weights are overlooked during searching architectures for meta-learning. Differently, we propose a more efficient and effective method for meta-learning, namely Connection-Adaptive Meta-learning (CAML), which jointly searches architectures and train the meta-weights on consolidated connections. During searching, we consolidate the architecture connections layer by layer, in which the layer with the largest weight value would be fixed first. With searching only once, our CAML is able to obtain both adaptive architecture and meta-weights fo meta-learning. Extensive experiments show that CAML achieves state-of-the-art performance with 130x less computational cost, revealing our method\u2019s effectiveness and efficiency.", + "title": "Connection-Adaptive Meta-Learning", + "authors": [ + "Yadong Ding", + "Yu Wu", + "Chengyue Huang", + "Siliang Tang", + "Yi Yang", + "Yueting Zhuang" + ], + "emails": [ + "~Yueting_Zhuang1", + "~Yu_Wu3", + "~Yi_Yang4", + "~Siliang_Tang1", + "~Chengyue_Huang1", + "~Yadong_Ding1" + ], + "rank": 2544 + }, + { + "url": "https://openreview.net/forum?id=PhV-qfEi3Mr", + "decision": "Reject", + "ratings": [ + 4, + 5, + 3, + 5 + ], + "rating": "4.18", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Crossbar-enabled analog computing-in-memory (CACIM) systems can significantly improve the computation speed and energy efficiency of deep neural networks (DNNs). However, the transition of DNN from the digital systems to CACIM systems usually reduces its accuracy. The major issue is that the weights of DNN are stored and calculated directly on analog quantities in CACIM systems. 
The variation and programming overhead of the analog weight limit the precision.\nTherefore, a suitable quantization algorithm is important when deploying a DNN into CACIM systems to obtain less accuracy loss. The analog weight has its unique advantages when doing quantization. Because there is no encoding and decoding process, the set of quanta will not affect the computing process. Therefore, a generalized quantization method that does not constrain the range of quanta and can obtain less quantization error will be effective in CACIM systems. For the first time, we introduced a generalized quantization method into CACIM systems and showed superior performance on a series of computer vision tasks, such as image classification, object detection, and semantic segmentation. Using the generalized quantization method, the DNN with 8-level analog weights can outperform the 32-bit networks. With fewer levels, the generalized quantization method can obtain less accuracy loss than other uniform quantization methods.", + "title": "Improving the accuracy of neural networks in analog computing-in-memory systems by a generalized quantization method", + "authors": [ + "Lingjun Dai", + "Qingtian Zhang", + "Huaqiang Wu" + ], + "emails": [ + "~Huaqiang_Wu1", + "~Qingtian_Zhang1", + "~Lingjun_Dai1" + ], + "rank": 2545 + }, + { + "url": "https://openreview.net/forum?id=KTEde38blNB", + "decision": "Reject", + "ratings": [ + 7, + 2, + 6, + 3 + ], + "rating": "4.18", + "confidences": [ + 4, + 5, + 3, + 5 + ], + "abstract": "In this paper we propose a novel approach for stabilizing the training process of Generative Adversarial Networks as well as alleviating the mode collapse problem. The main idea is to incorporate a regularization term that we call intervention into the objective. We refer to the resulting generative model as Intervention Generative Adversarial Networks (IVGAN). 
By perturbing the latent representations of real images obtained from an auxiliary encoder network with Gaussian invariant interventions and penalizing the dissimilarity of the distributions of the resulting generated images, the intervention term provides more informative gradient for the generator, significantly improving training stability and encouraging modecovering behaviour. We demonstrate the performance of our approach via solid theoretical analysis and thorough evaluation on standard real-world datasets as well as the stacked MNIST dataset.\n", + "title": "Intervention Generative Adversarial Nets", + "authors": [ + "Jiadong Liang", + "Liangyu Zhang", + "Cheng Zhang", + "Zhihua Zhang" + ], + "emails": [ + "~Liangyu_Zhang2", + "~Jiadong_Liang1", + "~Cheng_Zhang3", + "~Zhihua_Zhang1" + ], + "rank": 2546 + }, + { + "url": "https://openreview.net/forum?id=6gZJ6f6pU6h", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 4 + ], + "rating": "4.18", + "confidences": [ + 3, + 5, + 4, + 5 + ], + "abstract": "Given multiple source datasets with labels, how can we train a target model with no labeled data? Multi-source domain adaptation (MSDA) aims to train a model using multiple source datasets different from a target dataset in the absence of target data labels. MSDA is a crucial problem applicable to many practical cases where labels for the target data are unavailable due to privacy issues. Existing MSDA frameworks are limited since they align data without considering conditional distributions p(x|y) of each domain. They also do not fully utilize the target data without labels, and rely on limited feature extraction with a single extractor. In this paper, we propose Multi-EPL, a novel method for multi-source domain adaptation. Multi-EPL exploits label-wise moment matching to align conditional distributions p(x|y), uses pseudolabels for the unavailable target labels, and introduces an ensemble of multiple feature extractors for accurate domain adaptation. 
Extensive experiments show that Multi-EPL provides the state-of-the-art performance for multi-source domain adaptation tasks in both of image domains and text domains.", + "title": "Multi-EPL: Accurate Multi-source Domain Adaptation", + "authors": [ + "Seongmin Lee", + "Hyunsik Jeon", + "U Kang" + ], + "emails": [ + "gmail.com", + "~Seongmin_Lee2", + "~U_Kang1" + ], + "rank": 2547 + }, + { + "url": "https://openreview.net/forum?id=FMdjYY6H8-Z", + "decision": "Reject", + "ratings": [ + 3, + 4, + 5 + ], + "rating": "4.18", + "confidences": [ + 3, + 3, + 5 + ], + "abstract": "The matrix local low rank representation (MLLRR) is a critical dimension reduction technique widely used in recommendation systems, text mining and computer vision. In MLLRR, how to robustly identify the row and column indices that forma distinct low rank sub-matrix is a major challenge. In this work, we first organized the general MLLRR problem into three inter-connected sub-problems based on different low rank properties, namely, LLR-1C, LLR-1, and LLR-r. Existing solutions on MLLRR all leverage problem-specific assumptions and mainly focused on the LLR-1C problem, which lacks the capacity to detect a substantial amount of true and interesting patterns generalizability and prohibits. In this work, we developed a novel multiple-filter based neural network framework, namely FLLRM, which is the first of its kind to solve all three MLLRR problems.We systematically benchmarked FLLRM with state-of-the-art methods on an extensive set of synthetic data, empowered by a robustness evaluation of parameters and theoretical discussions. Experimental results showed that FLLRM outperforms all existing methods and enables a general solution to all the three sub-problems. 
Experiments on real-world datasets also validated the effectiveness of FLLRM on identifying local low rank matrices corresponding to novel context specific knowledge.", + "title": "RETHINKING LOCAL LOW RANK MATRIX DETECTION:A MULTIPLE-FILTER BASED NEURAL NETWORK FRAMEWORK", + "authors": [ + "Pengtao Dang", + "Wennan Chang", + "Haiqi Zhu", + "Changlin Wan", + "Tong Zhao", + "Tingbo Guo", + "Paul Salama", + "Sha Cao", + "Chi Zhang" + ], + "emails": [ + "purdue.edu", + "iu.edu", + "~Changlin_Wan1", + "foxmail.com", + "amazon.com", + "~Pengtao_Dang1", + "gmail.com", + "~Paul_Salama1" + ], + "rank": 2548 + }, + { + "url": "https://openreview.net/forum?id=ZD7Ll4pAw7C", + "decision": "Reject", + "ratings": [ + 5, + 3, + 5, + 4 + ], + "rating": "4.18", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "Channel pruning is a popular technique for compressing convolutional neural networks (CNNs), and various pruning criteria have been proposed to remove the redundant filters of CNNs. From our comprehensive experiments, we find some blind spots on pruning criteria: (1) Similarity: There are some strong similarities among several primary pruning criteria that are widely cited and compared. According to these criteria, the ranks of filters\u2019 importance in a convolutional layer are almost the same, resulting in similar pruned structures. (2) Applicability: For a large network (each convolutional layer has a large number of filters), some criteria can not distinguish the network redundancy well from their measured filters' importance. In this paper, we theoretically validate these two findings with our assumption that the well-trained convolutional filters in each layer approximately follow a Gaussian-alike distribution. This assumption is verified through systematic and extensive statistical tests. 
", + "title": "Rethinking the Pruning Criteria for Convolutional Neural Network", + "authors": [ + "Zhongzhan Huang", + "Xinjiang Wang", + "Ping Luo" + ], + "emails": [ + "~Xinjiang_Wang1", + "~Zhongzhan_Huang1", + "~Ping_Luo2" + ], + "rank": 2549 + }, + { + "url": "https://openreview.net/forum?id=hLr7JzDFQBn", + "ratings": [ + 4, + 4, + 4, + 5 + ], + "rating": "4.18", + "confidences": [ + 4, + 5, + 5, + 3 + ], + "abstract": "We need intelligent robots to perform mobile construction, the process of moving in an environment and modifying its geometry according to a design plan. Without GPS or similar techniques, carefully engineered and learning-based methods face challenges to exactly achieve the plan due to the difficulty of accurately localizing the robot while strategically evolving the environment, because common tasks (manipulation/navigation) address at most one of the two coupled aspects. To seek a generic solution, we simplify mobile construction in 1/2/3D grid worlds to benchmark the performance of existing deep RL methods on this partially observable MDP. Our results show that the coupling makes this problem very challenging for model-free RL, and emphasize the need for novel task-specific solutions.", + "title": "Mobile Construction Benchmark", + "authors": [ + "Wenyu Han", + "Chen Feng", + "Haoran Wu", + "Alexander Gao", + "Armand Jordana", + "Dongdong Liu", + "Lerrel Pinto", + "Ludovic Righetti" + ], + "emails": [ + "~Ludovic_Righetti1", + "~Lerrel_Pinto1", + "~Dongdong_Liu1", + "nyu.edu", + "~Chen_Feng2" + ], + "rank": 2550 + }, + { + "url": "https://openreview.net/forum?id=1-j4VLSHApJ", + "decision": "Reject", + "ratings": [ + 4, + 5, + 3 + ], + "rating": "4.18", + "confidences": [ + 3, + 5, + 3 + ], + "abstract": "Recent work in black-box adversarial attacks for NLP systems has attracted attention. Prior black-box attacks assume that attackers can observe output labels from target models based on selected inputs. 
In this work, inspired by adversarial transferability, we propose a new type of black-box NLP adversarial attack that an attacker can choose a similar domain and transfer the adversarial examples to the target domain and cause poor performance in target model. Based on domain adaptation theory, we then propose a defensive strategy, called Learn2Weight, which trains to predict the weight adjustments for target model in order to defense the attack of similar-domain adversarial examples. Using Amazon multi-domain sentiment classification dataset, we empirically show that Learn2Weight model is effective against the attack compared to standard black-box defense methods such as adversarial training and defense distillation. This work contributes to the growing literature on machine learning safety.", + "title": "Learn2Weight: Weights Transfer Defense against Similar-domain Adversarial Attacks", + "authors": [ + "Siddhartha Datta" + ], + "emails": [ + "~Siddhartha_Datta1" + ], + "rank": 2551 + }, + { + "url": "https://openreview.net/forum?id=TTnPcO6kK5", + "decision": "Reject", + "ratings": [ + 4, + 3, + 6 + ], + "rating": "4.18", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "Stochastic momentum optimization methods, also known as stochastic heavy ball (SHB) methods, are one of the most popular optimization methods for deep learning. These methods can help accelerate stochastic gradient descent and dampen oscillations. In this paper we provide a new variant of the stochastic heavy ball method, called stochastic Euler\u2019s heavy ball (SEHB). The proposed SEHB method modifies the steepest descent direction to achieve acceleration, and combines Euler\u2018s method to adaptively adjust learning rates as well. A convergence analysis of the regret bound is discussed under the online convex optimization framework. Furthermore, we conduct experiments on various popular datasets and deep learning models. 
Empirical results demonstrate that our SEHB method shows comparable or even better generalization performance than state-of-the-art optimization methods such as SGD and Adam. ", + "title": "A New Variant of Stochastic Heavy ball Optimization Method for Deep Learning", + "authors": [ + "Zhou Shao", + "Tong Lin" + ], + "emails": [ + "~Tong_Lin1", + "~Zhou_Shao1" + ], + "rank": 2552 + }, + { + "url": "https://openreview.net/forum?id=oxRaiMDSzwr", + "decision": "Reject", + "ratings": [ + 4, + 3, + 3, + 7 + ], + "rating": "4.18", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "A key challenge in analyzing neural networks' robustness is identifying input features for which networks are robust to perturbations. Existing work focuses on direct perturbations to the inputs, thereby studies network robustness to the lowest-level features. In this work, we take a new approach and study the robustness of networks to the inputs' semantic features. We show a black-box approach to determine features for which a network is robust or weak. We leverage these features to obtain provably robust neighborhoods defined using robust features and adversarial examples defined by perturbing weak features. We evaluate our approach with PCA features. We show (1) provably robust neighborhoods are larger: on average by 1.8x and up to 4.5x, compared to the standard neighborhoods, and (2) our adversarial examples are generated using at least 8.7x fewer queries and have at least 2.8x lower L2 distortion compared to state-of-the-art. We further show that our attack is effective even against ensemble adversarial training. 
", + "title": "NETWORK ROBUSTNESS TO PCA PERTURBATIONS", + "authors": [ + "Anan Kabaha", + "Dana Drachsler Cohen" + ], + "emails": [ + "~Anan_Kabaha3", + "~Dana_Drachsler_Cohen1" + ], + "rank": 2553 + }, + { + "url": "https://openreview.net/forum?id=SP5RHi-rdlJ", + "decision": "Reject", + "ratings": [ + 3, + 4, + 5, + 5 + ], + "rating": "4.18", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Quantized neural networks are gaining popularity thanks to their ability to solve complex tasks with comparable accuracy as full-precision Deep Neural Networks (DNNs), while also reducing computational power and storage requirements and increasing the processing speed. These properties make them an attractive alternative for the development and deployment of DNN-based applications in Internet-Of-Things (IoT) devices. Among quantized networks, Binary Neural Networks (BNNs) have reported the largest speed-up. However, they suffer from a fixed and limited compression factor that may result insufficient for certain devices with very limited resources. In this work, we propose Sparse Binary Neural Networks, a novel model and training scheme that allows to introduce sparsity in BNNs by using positive 0/1 binary weights, instead of the -1/+1 weights used by state-of-the-art binary networks. As a result, our method is able to achieve a high compression factor and reduces the number of operations and parameters at inference time. We study the properties of our method through experiments on linear and convolutional networks over MNIST and CIFAR-10 datasets. 
Experiments confirm that SBNNs can achieve high compression rates and good generalization, while further reducing the operations of BNNs, making it a viable option for deploying DNNs in very cheap and low-cost IoT devices and sensors.", + "title": "Sparse Binary Neural Networks", + "authors": [ + "Riccardo Schiavone", + "Maria A Zuluaga" + ], + "emails": [ + "~Riccardo_Schiavone1", + "~Maria_A_Zuluaga1" + ], + "rank": 2554 + }, + { + "url": "https://openreview.net/forum?id=sfy1DGc54-M", + "decision": "Reject", + "ratings": [ + 4, + 3, + 6, + 4 + ], + "rating": "4.18", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "Despite the remarkable success of deep neural networks, significant concerns have emerged about their robustness to adversarial perturbations to inputs. While most attacks aim to ensure that these are imperceptible, physical perturbation attacks typically aim for being unsuspicious, even if perceptible. However, there is no universal notion of what it means for adversarial examples to be unsuspicious. We propose an approach for modeling suspiciousness by leveraging cognitive salience. Specifically, we split an image into foreground (salient region) and background (the rest), and allow significantly larger adversarial perturbations in the background, while ensuring that cognitive salience of background remains low. We describe how to compute the resulting non-salience-preserving dual-perturbation attacks on classifiers. We then experimentally demonstrate that our attacks indeed do not significantly change perceptual salience of the background, but are highly effective against classifiers robust to conventional attacks. 
Furthermore, we show that adversarial training with dual-perturbation attacks yields classifiers that are more robust to these than state-of-the-art robust learning approaches, and comparable in terms of robustness to conventional attacks.", + "title": "Towards Robustness against Unsuspicious Adversarial Examples", + "authors": [ + "Liang Tong", + "Minzhe Guo", + "Atul Prakash", + "Yevgeniy Vorobeychik" + ], + "emails": [ + "wustl.edu", + "~Yevgeniy_Vorobeychik1", + "~Atul_Prakash1", + "~Liang_Tong1" + ], + "rank": 2555 + }, + { + "url": "https://openreview.net/forum?id=rLj5jTcCUpp", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 5 + ], + "rating": "4.17", + "confidences": [ + 3, + 3, + 4, + 2 + ], + "abstract": "We propose Distribution Embedding Network (DEN) for meta-learning, which is designed for applications where both the distribution and the number of features could vary across tasks. DEN first transforms features using a learned piecewise linear function, then learns an embedding of the underlying data distribution after the transformation, and finally classifies examples based on the distribution embedding. We show that the parameters of the distribution embedding and the classification modules can be shared across tasks. 
We propose a novel methodology to mass-simulate binary classification training tasks, and demonstrate that DEN outperforms existing methods in a number of test tasks in numerical studies.", + "title": "Distribution Embedding Network for Meta-Learning with Variable-Length Input", + "authors": [ + "Lang Liu", + "Mahdi Milani Fard", + "Sen Zhao" + ], + "emails": [ + "~Lang_Liu1", + "~Sen_Zhao1", + "~Mahdi_Milani_Fard1" + ], + "rank": 2556 + }, + { + "url": "https://openreview.net/forum?id=TPFhIknddKX", + "ratings": [ + 5, + 6, + 3, + 3 + ], + "rating": "4.17", + "confidences": [ + 4, + 2, + 3, + 3 + ], + "abstract": "We re-examine Routing Networks, an approach to multi-task learning that uses reinforcement learning to decide parameter sharing with the goal of maximizing knowledge transfer between related tasks while avoiding task interference. These benefits come with the cost of solving a more difficult optimization problem. We argue that the success of this model depends on a few key assumptions and, when they are not satisfied, the difficulty of learning a good route can outweigh the benefits of the approach. In these cases, a simple unlearned routing strategy, which we propose, achieves the best results.", + "title": "Re-examining Routing Networks for Multi-task Learning", + "authors": [ + "Limeng Cui", + "Aaron Jaech" + ], + "emails": [ + "fb.com", + "~Limeng_Cui1" + ], + "rank": 2557 + }, + { + "url": "https://openreview.net/forum?id=4kWGWoFGA_H", + "decision": "Reject", + "ratings": [ + 4, + 6, + 3, + 4 + ], + "rating": "4.17", + "confidences": [ + 2, + 3, + 4, + 3 + ], + "abstract": "We investigate the robustness of video machine learning models to bit-level network and file corruptions, which can arise from network transmission failures or hardware errors, and explore defenses against such corruptions. 
We simulate network and file corruptions at multiple corruption levels, and find that bit-level corruptions can cause substantial performance drops on common action recognition and multi-object tracking tasks. We explore two types of defenses against bit-level corruptions: corruption-agnostic and corruption-aware defenses. We find that corruption-agnostic defenses such as adversarial training have limited effectiveness, performing up to 11.3 accuracy points worse than a no-defense baseline. In response, we propose Bit-corruption Augmented Training (BAT), a corruption-aware baseline that exploits knowledge of bit-level corruptions to enforce model invariance to such corruptions. BAT outperforms corruption-agnostic defenses, recovering up to 7.1 accuracy points over a no-defense baseline on highly-corrupted videos while maintaining competitive performance on clean/near-clean data.", + "title": "Beyond the Pixels: Exploring the Effects of Bit-Level Network and File Corruptions on Video Model Robustness", + "authors": [ + "Trenton Chang", + "Daniel Yang Fu", + "Yixuan Li" + ], + "emails": [ + "~Trenton_Chang1", + "~Daniel_Yang_Fu1", + "~Yixuan_Li1" + ], + "rank": 2558 + }, + { + "url": "https://openreview.net/forum?id=MU0yqXIoleL", + "ratings": [ + 5, + 4, + 4, + 4 + ], + "rating": "4.17", + "confidences": [ + 3, + 5, + 5, + 5 + ], + "abstract": "In this paper, we study the problem of learning continuous vector representations of knowledge graphs for predicting missing links. We present a new approach called ConEx, which infers missing links by leveraging the composition of a 2D convolution with a Hermitian inner product of complex-valued embedding vectors. We evaluate ConEx against state-of-the-art approaches on the WN18RR, FB15K-237, KINSHIP and UMLS benchmark datasets. 
Our experimental results show that ConEx achieves a performance superior to that of state-of-the-art approaches such as RotatE, QuatE and TuckER in the link prediction task on all datasets while requiring at least 8 times fewer parameters. We ensure the reproducibility of our results by providing an open-source implementation which includes the training, evaluation scripts along with pre-trained models at https://github.com/conex-kge/ConEx.", + "title": "Convolutional Complex Knowledge Graph Embeddings", + "authors": [ + "Caglar Demir", + "Axel Ngonga" + ], + "emails": [ + "~Caglar_Demir1", + "~Axel_Ngonga1" + ], + "rank": 2559 + }, + { + "url": "https://openreview.net/forum?id=cq4FHzAz9eA", + "ratings": [ + 3, + 4, + 5, + 5 + ], + "rating": "4.17", + "confidences": [ + 5, + 5, + 4, + 4 + ], + "abstract": "The success of gragh neural networks (GNNs) in the past years has aroused grow-ing interest and effort in designing best models to handle graph-structured data. Asthe neural architecture search technique has been witnessed to rival against humanexperts in discovering performant network topologies, recently, it has been appliedto the field of graphic network engineering. However, such works on graphic NASso far are purely software design and not considering hardware constraints at all.To address this problem, we propose the first SW-HW codesign framework forautomating the deployment of GNNs. Using FPGA as the target platform, ourframework is able to performs the FPGA-aware graph neural architecture search(FGNAS). 
To evaluate our design, we experiment on benckmark datasets, namelyCora, CiteCeer, and PubMed, and the results show FGNAS has better capabil-ity in optimizing the accuracy of GNNs when their hardware implementation isspecifically constrained.", + "title": "FGNAS: FPGA-Aware Graph Neural Architecture Search", + "authors": [ + "Qing Lu", + "Weiwen Jiang", + "Meng Jiang", + "Jingtong Hu", + "Sakyasingha Dasgupta", + "Yiyu Shi" + ], + "emails": [ + "~Sakyasingha_Dasgupta1", + "~Meng_Jiang3", + "~Weiwen_Jiang1", + "~Qing_Lu2", + "~Jingtong_Hu1", + "~Yiyu_Shi1" + ], + "rank": 2560 + }, + { + "url": "https://openreview.net/forum?id=naSAkn2Xo46", + "decision": "Reject", + "ratings": [ + 5, + 3, + 5 + ], + "rating": "4.17", + "confidences": [ + 3, + 5, + 4 + ], + "abstract": "Very large action spaces constitute a critical challenge for deep Reinforcement Learning (RL) algorithms. An existing approach consists in splitting the action space into smaller components and choosing either independently or sequentially actions in each dimension. This approach led to astonishing results for the StarCraft and Dota 2 games, however it remains underexploited and understudied. In this paper, we name this approach Factored Actions Reinforcement Learning (FARL) and study both its theoretical impact and practical use. Notably, we provide a theoretical analysis of FARL on the Proximal Policy Optimization (PPO) and Soft Actor Critic (SAC) algorithms and evaluate these agents in different classes of problems. 
We show that FARL is a very versatile and efficient approach to combinatorial and continuous control problems.", + "title": "Factored Action Spaces in Deep Reinforcement Learning", + "authors": [ + "Thomas PIERROT", + "Valentin Mac\u00e9", + "Jean-Baptiste Sevestre", + "Louis Monier", + "Alexandre Laterre", + "Nicolas Perrin", + "Karim Beguir", + "Olivier Sigaud" + ], + "emails": [ + "~Alexandre_Laterre1", + "~Nicolas_Perrin1", + "instadeep.com", + "~Thomas_PIERROT1", + "~Olivier_Sigaud1", + "~Valentin_Mac\u00e91" + ], + "rank": 2561 + }, + { + "url": "https://openreview.net/forum?id=F438zjb-XaM", + "decision": "Reject", + "ratings": [ + 4, + 3, + 5 + ], + "rating": "4.17", + "confidences": [ + 4, + 3, + 5 + ], + "abstract": "Building effective neural machine translation (NMT) models for very low-resourced and morphologically rich African indigenous languages is an open challenge. Besides the issue of finding available resources for them, a lot of work is put into preprocessing and tokenization. Recent studies have shown that standard tokenization methods do not always adequately deal with the grammatical, diacritical, and tonal properties of some African languages. That, coupled with the extremely low availability of training samples, hinders the production of reliable NMT models. In this paper, using Fon language as a case study, we revisit standard tokenization methods and introduce Word-Expressions-Based (WEB) tokenization, a human-involved super-words tokenization strategy to create a better representative vocabulary for training.", + "title": "Crowd-sourced Phrase-Based Tokenization for Low-Resourced Neural Machine Translation: The case of Fon Language", + "authors": [ + "Bonaventure F. P. 
Dossou", + "Chris Chinenye Emezue" + ], + "emails": [ + "~Chris_Chinenye_Emezue1", + "~Bonaventure_F._P._Dossou1" + ], + "rank": 2562 + }, + { + "url": "https://openreview.net/forum?id=bfTUfrqL6d", + "decision": "Reject", + "ratings": [ + 3, + 5, + 5 + ], + "rating": "4.17", + "confidences": [ + 5, + 3, + 4 + ], + "abstract": "Aspect-based sentiment classification aims to predict sentimental polarities of one or multiple aspects in texts. As texts always contain a large proportion of task-irrelevant words, accurate alignment between aspects and their sentimental descriptions is the most crucial and challenging step. State-of-the-art approaches are mainly based on word-level attention learned from recurrent neural network variants (e.g., LSTM) or graph neural networks. From another view, these methods essentially weight and aggregate all possible alignments. However, this mechanism heavily relies on large-scale supervision training: without enough labels, it could easily overfit with difficulty in generalization. To address this challenge, we propose SentRL, a reinforcement learning-based framework for aspect-based sentiment classification. In this framework, input texts are transformed into their dependency graphs. Then, an agent is deployed to walk on the graphs, explores paths from target aspect nodes to their potential sentimental regions, and differentiates the effectiveness of different paths. By limiting the agent's exploration budget, our method encourages the agent to skip task-irrelevant information and focus on the most effective paths for alignment purpose. Our method considerably reduces the impact of task-irrelevant words and improves generalization performance. Compared with competitive baseline methods, our approach achieves the highest performance on public benchmark datasets with up to 3.7% improvement. 
", + "title": "Aspect-based Sentiment Classification via Reinforcement Learning", + "authors": [ + "Lichen Wang", + "Bo Zong", + "Yunyu Liu", + "Can Qin", + "Wei Cheng", + "Wenchao Yu", + "Xuchao Zhang", + "Haifeng Chen", + "Yun Fu" + ], + "emails": [ + "~Yunyu_Liu1", + "~Wenchao_Yu1", + "nec-labs.com", + "~Bo_Zong1", + "~Haifeng_Chen1", + "~Yun_Fu1", + "~Lichen_Wang1", + "~Can_Qin1", + "~Wei_Cheng1" + ], + "rank": 2563 + }, + { + "url": "https://openreview.net/forum?id=-J9xYzP2HD", + "decision": "Reject", + "ratings": [ + 3, + 3, + 4, + 6, + 6 + ], + "rating": "4.16", + "confidences": [ + 4, + 5, + 4, + 3, + 3 + ], + "abstract": "Parametric models, and particularly neural networks, require weight initialization as a starting point for gradient-based optimization. Recent work shows that an initial parameter set can be learned from a population of supervised learning tasks that enables a fast convergence for unseen tasks even when only a handful of instances is available (model-agnostic meta-learning). \nCurrently, methods for learning model initializations are limited to a population of tasks sharing the same schema, i.e., the same number, order, type, and semantics of predictor and target variables.\nIn this paper, we address the problem of meta-learning weight initialization across tasks with different schemas, for example, if the number of predictors varies across tasks, while they still share some variables. We propose Chameleon, a model that learns to align different predictor schemas to a common representation. 
\nIn experiments on 23 datasets of the OpenML-CC18 benchmark, we show that Chameleon can successfully learn parameter initializations across tasks with different schemas, presenting, to the best of our knowledge, the first cross-dataset few-shot classification approach for unstructured data.", + "title": "Chameleon: Learning Model Initializations Across Tasks With Different Schemas", + "authors": [ + "Lukas Brinkmeyer", + "Rafael Rego Drumond", + "Randolf Scholz", + "Josif Grabocka", + "Lars Schmidt-Thieme" + ], + "emails": [ + "ismll.uni-hildesheim.de", + "~Josif_Grabocka1", + "~Lars_Schmidt-Thieme1", + "~Lukas_Brinkmeyer1" + ], + "rank": 2564 + }, + { + "url": "https://openreview.net/forum?id=kHromd7SNA", + "ratings": [ + 4, + 4, + 4, + 5 + ], + "rating": "4.15", + "confidences": [ + 5, + 3, + 3, + 2 + ], + "abstract": "We propose novel motion representations for animating articulated objects consisting of distinct parts. In a completely unsupervised manner, our method identifies meaningful object parts, tracks them in a driving video, and infers their motions by considering their principal axes. In contrast to the previous keypoint-based works, our method extracts meaningful and consistent regions, describing locations, shape, and pose. The regions correspond to semantically relevant and distinct object parts, that are more easily detected in frames of the driving video. To force decoupling of foreground from background, we model non-object related global motion with a homography. Our model can animate a variety of objects, surpassing previous methods by a large margin on existing benchmarks. We present a challenging new benchmark with high-resolution videos and show that the improvement is particularly pronounced when articulated objects are considered. ", + "title": "Motion Representations for Articulated Animation", + "authors": [ + "Aliaksandr Siarohin", + "Oliver J. 
Woodford", + "Jian Ren", + "Menglei Chai", + "Sergey Tulyakov" + ], + "emails": [ + "~Jian_Ren2", + "~Menglei_Chai1", + "~Sergey_Tulyakov1", + "~Oliver_J._Woodford1", + "~Aliaksandr_Siarohin1" + ], + "rank": 2565 + }, + { + "url": "https://openreview.net/forum?id=0i0IjXuq6J5", + "ratings": [ + 5, + 4, + 3, + 6 + ], + "rating": "4.15", + "confidences": [ + 3, + 3, + 5, + 2 + ], + "abstract": "Contrastive representation learning has been recently proved to be very efficient for self-supervised training. These methods have been successfully used to train encoders which perform comparably to supervised training on downstream classification tasks. \nA few works have started to build a theoretical framework around contrastive learning in which guarantees for its performance can be proven. We provide extensions of these results to training with multiple negative samples and for multiway classification. \nFurthermore, we provide convergence guarantees for the minimization of the contrastive training error with gradient descent of an overparametrized deep neural encoder, and provide some numerical experiments that complement our theoretical findings.", + "title": "About contrastive unsupervised representation learning for classification and its convergence", + "authors": [ + "Ibrahim Merad", + "Yiyang Yu", + "Emmanuel Bacry", + "St\u00e9phane Ga\u00efffas" + ], + "emails": [ + "~Yiyang_Yu1", + "~St\u00e9phane_Ga\u00efffas1", + "~Ibrahim_Merad1", + "~Emmanuel_Bacry1" + ], + "rank": 2566 + }, + { + "url": "https://openreview.net/forum?id=mxfRhLgLg_", + "decision": "Reject", + "ratings": [ + 3, + 4, + 7, + 3 + ], + "rating": "4.15", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "We introduce an efficient approximation to the loss function for the ecological inference problem, where individual labels are predicted from aggregates. This allows us to construct ecological versions of linear models, deep neural networks, and Bayesian neural networks. 
Using these models we infer probabilities of vote choice for candidates in the Maryland 2018 midterm elections for 2,322,277 voters in 2055 precincts. We show that increased network depth and joint learning of multiple races within an election improves the accuracy of ecological inference when compared to benchmark data from polling. Additionally we leverage data on the joint distribution of ballots (available from ballot images which are public for election administration purposes) to show that joint learning leads to significantly improved recovery of the covariance structure for multi-task ecological inference. Our approach also allows learning latent representations of voters, which we show outperform raw covariates for leave-one-out prediction. ", + "title": "Deep Ecological Inference", + "authors": [ + "Nic Fishman", + "Colin McAuliffe" + ], + "emails": [ + "dataforprogress.org", + "~Nic_Fishman1" + ], + "rank": 2567 + }, + { + "url": "https://openreview.net/forum?id=4ADnf1HqIw", + "decision": "Reject", + "ratings": [ + 4, + 3, + 5, + 4 + ], + "rating": "4.15", + "confidences": [ + 4, + 2, + 4, + 3 + ], + "abstract": "Regularization is used to avoid overfitting when training a neural network; unfortunately, this reduces the attainable level of detail hindering the ability to capture high-frequency information present in the training data. Even though various approaches may be used to re-introduce high-frequency detail, it typically does not match the training data and is often not time coherent. In the case of network inferred cloth, these sentiments manifest themselves via either a lack of detailed wrinkles or unnaturally appearing and/or time incoherent surrogate wrinkles. Thus, we propose a general strategy whereby high-frequency information is procedurally embedded into low-frequency data so that when the latter is smeared out by the network the former still retains its high-frequency detail. 
We illustrate this approach by learning texture coordinates which when smeared do not in turn smear out the high-frequency detail in the texture itself but merely smoothly distort it. Notably, we prescribe perturbed texture coordinates that are subsequently used to correct the over-smoothed appearance of inferred cloth, and correcting the appearance from multiple camera views naturally recovers lost geometric information.", + "title": "Recovering Geometric Information with Learned Texture Perturbations", + "authors": [ + "Jane Wu", + "Yongxu Jin", + "Zhenglin Geng", + "Hui Zhou", + "Ronald Fedkiw" + ], + "emails": [ + "jd.com", + "stanford.edu", + "~Jane_Wu2", + "~Ronald_Fedkiw1" + ], + "rank": 2568 + }, + { + "url": "https://openreview.net/forum?id=7AQUzh5ntX_", + "ratings": [ + 6, + 4, + 4, + 3 + ], + "rating": "4.14", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "First-person object-interaction tasks in high-fidelity, 3D, simulated environments such as the AI2Thor virtual home-environment pose significant sample-efficiency challenges for reinforcement learning (RL) agents learning from sparse task rewards. To alleviate these challenges, prior work has provided extensive supervision via a combination of reward-shaping, ground-truth object-information, and expert demonstrations. In this work, we show that one can learn object-interaction tasks from scratch without supervision by learning an attentive object-model as an auxiliary task during task learning with an object-centric relational RL agent. Our key insight is that learning an object-model that incorporates object-relationships into forward prediction provides a dense learning signal for unsupervised representation learning of both objects and their relationships. This, in turn, enables faster policy learning for an object-centric relational RL agent. 
We demonstrate our agent by introducing a set of challenging object-interaction tasks in the AI2Thor environment where learning with our attentive object-model is key to strong performance. Specifically, by comparing our agent and relational RL agents with alternative auxiliary tasks with a relational RL agent equipped with ground-truth object-information, we find that learning with our object-model best closes the performance gap in terms of both learning speed and maximum success rate. Additionally, we find that incorporating object-relationships into an object-model's forward predictions is key to learning representations that capture object-category and object-state.", + "title": "Reinforcement Learning for Sparse-Reward Object-Interaction Tasks in First-person Simulated 3D Environments", + "authors": [ + "Wilka Torrico Carvalho", + "Anthony Liang", + "Kimin Lee", + "Sungryull Sohn", + "Honglak Lee", + "Richard Lewis", + "Satinder Singh" + ], + "emails": [ + "~Satinder_Singh2", + "~Kimin_Lee1", + "~Sungryull_Sohn1", + "~Richard_Lewis1", + "~Honglak_Lee2", + "~Wilka_Torrico_Carvalho1", + "umich.edu" + ], + "rank": 2569 + }, + { + "url": "https://openreview.net/forum?id=t5lNr0Lw84H", + "decision": "Reject", + "ratings": [ + 6, + 3, + 4, + 4 + ], + "rating": "4.14", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "We benchmark commonly used multi-agent deep reinforcement learning (MARL) algorithms on a variety of cooperative multi-agent games. While there has been significant innovation in MARL algorithms, algorithms tend to be tested and tuned on a single domain and their average performance across multiple domains is less characterized. Furthermore, since the hyperparameters of the algorithms are carefully tuned to the task of interest, it is unclear whether hyperparameters can easily be found that allow the algorithm to be repurposed for other cooperative tasks with different reward structure and environment dynamics. 
To investigate the consistency of the performance of MARL algorithms, we build an open-source library of multi-agent algorithms including DDPG/TD3/SAC with centralized Q functions, PPO with centralized value functions, and QMix and test them across a range of tasks that vary in coordination difficulty and agent number. The domains include the particle-world environments, starcraft micromanagement challenges, the Hanabi challenge, and the hide-and-seek environments. Finally, we investigate the ease of hyper-parameter tuning for each of the algorithms by tuning hyper-parameters in one environment per domain and re-using them in the other environments within the domain.", + "title": "Benchmarking Multi-Agent Deep Reinforcement Learning Algorithms", + "authors": [ + "Chao Yu", + "Akash Velu", + "Eugene Vinitsky", + "Yu Wang", + "Alexandre Bayen", + "Yi Wu" + ], + "emails": [ + "mails.tsinghua.edu.cn", + "~Yu_Wang3", + "~Alexandre_Bayen2", + "~Eugene_Vinitsky1", + "berkeley.edu", + "~Yi_Wu1" + ], + "rank": 2570 + }, + { + "url": "https://openreview.net/forum?id=4QpDyzCoH01", + "ratings": [ + 5, + 4, + 5, + 3 + ], + "rating": "4.14", + "confidences": [ + 4, + 4, + 2, + 4 + ], + "abstract": "The exceptional success of deep learning comes at the cost of long training sessions, and a slow iterative process of proposing new architectures that have to be hand-engineered through years of experience. Neural Architecture Search (NAS) is the line of research that tries to automatically design architectures with better performances at a given task. The performance of a network in a task can be predicted by a score, even before the network is trained: this is referred to as zero-shot NAS. However, the existing score remains unreliable for architectures with high accuracy. We develop in this direction by exploring different related scores. We study their time efficiency and we improve on their dependence with the final accuracy, especially for high values of the score. 
We propose a monotonicity metric to evaluate the adequate relative scoring of the architectures, as a way to avoid imposing a linearity assumption too early. We find that our use of noise improves the score, but a more substantial improvement comes when the evaluation of the score is done in the parameter space. We hope this effort will help clarify promising directions to speed up automatic discovery of good neural architectures without training.", + "title": "Improving Zero-Shot Neural Architecture Search with Parameters Scoring", + "authors": [ + "Luca Celotti", + "Ismael Balafrej", + "Emmanuel Calvet" + ], + "emails": [ + "usherbrooke.ca", + "~Luca_Celotti1" + ], + "rank": 2571 + }, + { + "url": "https://openreview.net/forum?id=Hr-cI3LMKb8", + "decision": "Reject", + "ratings": [ + 4, + 4, + 3, + 6 + ], + "rating": "4.14", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Identifying the dominant factors of variation across a dataset is a central goal of representation learning. Generative approaches lead to descriptions that are rich enough to recreate the data, but often only a partial description is needed to complete downstream tasks or to gain insights about the dataset. In this work, we operate in the setting where limited information is known about the data in the form of groupings, or set membership, and the task is to learn representations which isolate the factors of variation that are common across the groupings. Our key insight is the use of affinity cycle consistency (ACC) between the learned embeddings of images belonging to different sets. In contrast to prior work, we demonstrate that ACC can be applied with significantly fewer constraints on the factors of variation, across a remarkably broad range of settings, and without any supervision for half of the data. By curating datasets from Shapes3D, we quantify the effectiveness of ACC through mutual information between the learned representations and the known generative factors. 
In addition, we demonstrate the applicability of ACC to the tasks of digit style isolation and synthetic-to-real object pose transfer and compare to generative approaches utilizing the same supervision.", + "title": "Leveraging affinity cycle consistency to isolate factors of variation in learned representations", + "authors": [ + "Kieran A Murphy", + "Varun Jampani", + "Srikumar Ramalingam", + "Ameesh Makadia" + ], + "emails": [ + "~Kieran_A_Murphy1", + "~Ameesh_Makadia1", + "~Srikumar_Ramalingam2", + "~Varun_Jampani2" + ], + "rank": 2572 + }, + { + "url": "https://openreview.net/forum?id=To4Wy2NEM2", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5, + 3 + ], + "rating": "4.14", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "We develop a novel framework that adds the regularizers to a family of adaptive optimizers in deep learning, such as MOMENTUM, ADAGRAD, ADAM, AMSGRAD, ADAHESSIAN, and create a new class of optimizers, which are named GROUP MOMENTUM, GROUP ADAGRAD, GROUP ADAM, GROUP AMSGRAD and GROUP ADAHESSIAN, etc., accordingly. We establish theoretically proven convergence guarantees in the stochastic convex settings, based on primal-dual methods. We evaluate the regularized effect of our new optimizers on three large-scale real-world ad click datasets with state-of-the-art deep learning models. The experimental results reveal that compared with the original optimizers with the post-processing procedure which use the magnitude pruning method, the performance of the models can be significantly improved on the same sparsity level. 
Furthermore, in comparison to the cases without magnitude pruning, our methods can achieve extremely high sparsity with significantly better or highly competitive performance.", + "title": "Adaptive Optimizers with Sparse Group Lasso", + "authors": [ + "Yun Yue", + "Suo Tong", + "Zhen Zhang", + "Yongchao Liu", + "Chunyang Wen", + "Huanjun Bao", + "Jinjie Gu", + "Yixiang Mu" + ], + "emails": [ + "~Yongchao_Liu2", + "antgroup.com", + "~Suo_Tong1", + "~Yun_Yue3" + ], + "rank": 2573 + }, + { + "url": "https://openreview.net/forum?id=9DQ0SdY4UIz", + "decision": "Reject", + "ratings": [ + 4, + 3, + 4, + 5 + ], + "rating": "4.14", + "confidences": [ + 5, + 2, + 3, + 4 + ], + "abstract": "We propose a novel local Subspace Indexing Model with Interpolation (SIM-I) for low-dimensional embedding of image datasets. Our SIM-I is constructed via two steps: in the first step we build a piece-wise linear affinity-aware subspace model under a given partition of the dataset; in the second step we interpolate between several adjacent linear subspace models constructed previously using the `\"center of mass\" calculation on Stiefel and Grassmann manifolds. The resulting subspace indexing model built by SIM-I is a globally non-linear low-dimensional embedding of the original data set. Furthermore, the interpolation step produces a `\"smoothed\u201d version of the piece-wise linear embedding mapping constructed in the first step, and can be viewed as a regularization procedure. We provide experimental results validating the effectiveness of SIM-I, that improves PCA recovery for SIFT dataset and nearest-neighbor classification success rates for MNIST and CIFAR-10 datasets. 
", + "title": "Effective Subspace Indexing via Interpolation on Stiefel and Grassmann manifolds", + "authors": [ + "Wenqing Hu", + "Tiefeng Jiang", + "Zhu Li" + ], + "emails": [ + "~Wenqing_Hu1", + "~Zhu_Li2", + "~Tiefeng_Jiang1" + ], + "rank": 2574 + }, + { + "url": "https://openreview.net/forum?id=3FAl0W6gZ_e", + "decision": "Reject", + "ratings": [ + 3, + 6, + 4, + 4 + ], + "rating": "4.14", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "We tackle the challenging problem of creating full and accurate three dimensional reconstructions of botanical trees with the topological and geometric accuracy required for subsequent physical simulation, e.g. in response to wind forces. Although certain aspects of our approach would benefit from various improvements, our results exceed the state of the art especially in geometric and topological complexity and accuracy. Starting with two dimensional RGB image data acquired from cameras attached to drones, we create point clouds, textured triangle meshes, and a simulatable and skinned cylindrical articulated rigid body model. We discuss the pros and cons of each step of our pipeline, and in order to stimulate future research we make the raw and processed data from every step of the pipeline as well as the final geometric reconstructions publicly available.", + "title": "Three Dimensional Reconstruction of Botanical Trees with Simulatable Geometry", + "authors": [ + "Ed Quigley", + "Winnie Lin", + "Yilin Zhu", + "Ronald Fedkiw" + ], + "emails": [ + "~Ed_Quigley1", + "~Winnie_Lin1", + "~Yilin_Zhu1", + "~Ronald_Fedkiw1" + ], + "rank": 2575 + }, + { + "url": "https://openreview.net/forum?id=Rw_vo-wIAa", + "decision": "Reject", + "ratings": [ + 4, + 3, + 5, + 5 + ], + "rating": "4.14", + "confidences": [ + 4, + 4, + 2, + 4 + ], + "abstract": "Cooperative multi-agent tasks require agents to deduce their own contributions with shared global rewards, known as the challenge of credit assignment. 
General methods for policy based multi-agent reinforcement learning to solve the challenge introduce differentiate value functions or advantage functions for individual agents. In multi-agent system, polices of different agents need to be evaluated jointly. In order to update polices synchronously, such value functions or advantage functions also need synchronous evaluation. However, in current methods, value functions or advantage functions use counter-factual joint actions which are evaluated asynchronously, thus suffer from natural estimation bias. In this work, we propose the approximatively synchronous advantage estimation. We first derive the marginal advantage function, an expansion from single-agent advantage function to multi-agent system. Further more, we introduce a policy approximation for synchronous advantage estimation, and break down the multi-agent policy optimization problem into multiple sub-problems of single-agent policy optimization. Our method is compared with baseline algorithms on StarCraft multi-agent challenges, and shows the best performance on most of the tasks.", + "title": "Multi-agent Policy Optimization with Approximatively Synchronous Advantage Estimation", + "authors": [ + "Lipeng Wan", + "Xuwei Song", + "Xuguang Lan", + "Nanning Zheng" + ], + "emails": [ + "~Lipeng_Wan1", + "stu.xjtu.edu.cn", + "~Nanning_Zheng1", + "~Xuguang_Lan2" + ], + "rank": 2576 + }, + { + "url": "https://openreview.net/forum?id=3JI45wPuReY", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4, + 4 + ], + "rating": "4.14", + "confidences": [ + 4, + 2, + 4, + 4 + ], + "abstract": "With ever increasing computational capacities, neural networks become more and more proficient at solving complex tasks. However, picking a sufficiently good network topology usually relies on expert human knowledge. Neural architecture search aims to reduce the extent of expertise that is needed. 
Modern architecture search techniques often rely on immense computational power, or apply trained meta controllers for decision making. We develop a framework for a genetic algorithm that is both computationally cheap and makes decisions based on mathematical criteria rather than trained parameters. It is a hybrid approach that fuses training and topology optimization together into one process. Structural modifications that are performed include adding or removing layers of neurons, with some re-training applied to make up for incurred change in input-output behaviour. Our ansatz is tested on both the SVHN and (augmented) CIFAR-10 datasets with limited computational overhead compared to training only the baseline. This algorithm can achieve a significant increase in accuracy (as compared to a fully trained baseline), rescue insufficient topologies that in their current state are only able to learn to a limited extent, and dynamically reduce network size without loss in achieved accuracy.", + "title": "Neural Network Surgery: Combining Training with Topology Optimization", + "authors": [ + "Elisabeth Schiessler", + "Roland Aydin", + "Kevin Linka", + "Christian Cyron" + ], + "emails": [ + "hzg.de", + "~Elisabeth_Schiessler1", + "tuhh.de", + "~Roland_Aydin1" + ], + "rank": 2577 + }, + { + "url": "https://openreview.net/forum?id=DUbd4PNhlg", + "ratings": [ + 5, + 5, + 4, + 4, + 3 + ], + "rating": "4.14", + "confidences": [ + 4, + 4, + 5, + 3, + 5 + ], + "abstract": "Over-parametrization has become a popular technique in deep learning. It is observed that by over-parametrization, a larger neural network needs a fewer training iterations than a smaller one to achieve a certain level of performance --- namely, over-parametrization leads to acceleration in optimization. However, despite that over-parametrization is widely used nowadays, little theory is available to explain the acceleration due to over-parametrization. 
In this paper, we propose understanding it by studying a simple problem first. Specifically, we consider the setting that there is a single teacher neuron with quadratic activation, where over-parametrization is realized by having multiple student neurons learn the data generated from the teacher neuron. We provably show that over-parametrization helps the iterate generated by gradient descent to enter the neighborhood of a global optimal solution that achieves zero testing error faster. On the other hand, we also point out an issue regarding the necessity of over-parametrization and study how the scaling of the output neurons affects the convergence time.\n\n", + "title": "Understanding How Over-Parametrization Leads to Acceleration: A case of learning a single teacher neuron", + "authors": [ + "Jun-Kun Wang", + "Jacob Abernethy" + ], + "emails": [ + "~Jun-Kun_Wang1", + "~Jacob_Abernethy1" + ], + "rank": 2578 + }, + { + "url": "https://openreview.net/forum?id=Hw2Za4N5hy0", + "decision": "Reject", + "ratings": [ + 4, + 3, + 6, + 3 + ], + "rating": "4.14", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": " In the federated learning paradigm, multiple mobile clients train local models independently based on datasets generated by edge devices, and the server aggregates parameters/gradients from local models to form a global model. However, existing model aggregation approaches suffer from high bias on both data distribution and parameter distribution for non-IID datasets, which result in severe accuracy drop for increasing number of heterogeneous clients. In this paper, we proposed a novel decoupled probabilistic-weighted gradient aggregation approach called FeDEC for federated learning. The key idea is to optimize gradient parameters and statistical parameters in a decoupled way, and aggregate the parameters from local models with probabilistic weights to deal with the heterogeneity of clients. 
Since the overall dataset is unaccessible by the central server, we introduce a variational inference method to derive the optimal probabilistic weights to minimize statistical bias. We further prove the convergence bound of the proposed approach. Extensive experiments using mainstream convolutional neural network models based on three federated datasets show that FeDEC significantly outperforms the state-of-the-arts in terms of model accuracy and training efficiency.", + "title": "Federated Learning with Decoupled Probabilistic-Weighted Gradient Aggregation", + "authors": [ + "Jian-hui Duan", + "Wenzhong Li", + "Sanglu Lu" + ], + "emails": [ + "~Jian-hui_Duan1", + "~Sanglu_Lu1", + "~Wenzhong_Li1" + ], + "rank": 2579 + }, + { + "url": "https://openreview.net/forum?id=C1VUD8RZ5wq", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 4 + ], + "rating": "4.14", + "confidences": [ + 2, + 4, + 4, + 4 + ], + "abstract": "Codistillation has been proposed as a mechanism to share knowledge among concurrently trained models by encouraging them to represent the same function through an auxiliary loss. This contrasts with the more commonly used fully-synchronous data-parallel stochastic gradient descent methods, where different model replicas average their gradients (or parameters) at every iteration and thus maintain identical parameters. We investigate codistillation in a distributed training setup, complementing previous work which focused on extremely large batch sizes. Surprisingly, we find that even at moderate batch sizes, models trained with codistillation can perform as well as models trained with synchronous data-parallel methods, despite using a much weaker synchronization mechanism. These findings hold across a range of batch sizes and learning rate schedules, as well as different kinds of models and datasets. 
Obtaining this level of accuracy, however, requires properly accounting for the regularization effect of codistillation, which we highlight through several empirical observations. Overall, this work contributes to a better understanding of codistillation and how to best take advantage of it in a distributed computing environment.", + "title": "A Closer Look at Codistillation for Distributed Training", + "authors": [ + "Shagun Sodhani", + "Olivier Delalleau", + "Mido Assran", + "Koustuv Sinha", + "Nicolas Ballas", + "Michael Rabbat" + ], + "emails": [ + "~Michael_Rabbat1", + "~Nicolas_Ballas1", + "~Mido_Assran1", + "~Koustuv_Sinha1", + "~Olivier_Delalleau1", + "~Shagun_Sodhani1" + ], + "rank": 2580 + }, + { + "url": "https://openreview.net/forum?id=Y9NIGVYXTuz", + "ratings": [ + 4, + 5, + 3, + 5 + ], + "rating": "4.13", + "confidences": [ + 3, + 3, + 5, + 4 + ], + "abstract": "A key challenge of dialog systems research is to effectively and efficiently adapt to new domains. A scalable paradigm for adaptation necessitates the development of generalizable models that perform well in few-shot settings. In this paper, we focus on the intent classification problem which aims to identify user intents given utterances addressed to the dialog system. We propose two approaches for improving the generalizability of utterance classification models: (1) example-driven training and (2) observers. Example-driven training learns to classify utterances by comparing to examples, thereby using the underlying encoder as a sentence similarity model. Prior work has shown that BERT-like models tend to attribute a significant amount of attention to the [CLS] token, which we hypothesize results in diluted representations. Observers are tokens that are not attended to, and are an alternative to the [CLS] token. 
The proposed methods attain state-of-the-art results on three intent prediction datasets (banking77, clinc150, hwu64) in both the full data and few-shot (10 examples per intent) settings. Furthermore, we demonstrate that the proposed approach can transfer to new intents and across datasets without any additional training. ", + "title": "Example-Driven Intent Prediction with Observers", + "authors": [ + "Shikib Mehri", + "Mihail Eric", + "Dilek Hakkani-Tur" + ], + "emails": [ + "~Mihail_Eric2", + "~Dilek_Hakkani-Tur1", + "~Shikib_Mehri1" + ], + "rank": 2581 + }, + { + "url": "https://openreview.net/forum?id=JE7a-YejzfN", + "decision": "Reject", + "ratings": [ + 5, + 4, + 3, + 5 + ], + "rating": "4.13", + "confidences": [ + 4, + 5, + 4, + 2 + ], + "abstract": " A growing body of recent evidence has highlighted the limitations of natural language processing (NLP) datasets and classifiers. These include the presence of annotation artifacts in datasets, classifiers relying on shallow features like a single word (e.g., if a movie review has the word \"romantic\", the review tends to be positive), or unnecessary words (e.g., learning a proper noun to classify a movie as positive or negative). The presence of such artifacts has subsequently led to the development of challenging datasets to force the model to generalize better. While a variety of heuristic strategies, such as counterfactual examples and contrast sets, have been proposed, the theoretical justification about what makes these examples difficult for the classifier is often lacking or unclear. In this paper, using tools from information geometry, we propose a theoretical way to quantify the difficulty of an example in NLP. Using our approach, we explore difficult examples for several deep learning architectures. We discover that BERT, CNN and fasttext are susceptible to word substitutions in high difficulty examples. These classifiers tend to perform poorly on the FIM test set. 
(generated by sampling and perturbing difficult examples, with accuracy dropping below 50%). We replicate our experiments on 5 NLP datasets (YelpReviewPolarity, AGNEWS, SogouNews, YelpReviewFull and Yahoo Answers). On YelpReviewPolarity we observe a correlation coefficient of -0.4 between resilience to perturbations and the difficulty score. Similarly we observe a correlation of 0.35 between the difficulty score and the empirical success probability of random substitutions. Our approach is simple, architecture agnostic and can be used to study the fragilities of text classification models. All the code used will be made publicly available, including a tool to explore the difficult examples for other datasets.\n ", + "title": "Geometry matters: Exploring language examples at the decision boundary", + "authors": [ + "Debajyoti Datta", + "Shashwat Kumar", + "Laura Barnes", + "Tom Fletcher" + ], + "emails": [ + "~Debajyoti_Datta1", + "virginia.edu", + "~Tom_Fletcher1", + "~Laura_Barnes1" + ], + "rank": 2582 + }, + { + "url": "https://openreview.net/forum?id=inBTt_wSv0", + "decision": "Reject", + "ratings": [ + 4, + 6, + 3, + 4 + ], + "rating": "4.13", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "The use of Deep Neural Networks (DNNs) as function approximators has led to striking progress for reinforcement learning algorithms and applications. At the same time, deep reinforcement learning agents have inherited the vulnerability of DNNs to imperceptible adversarial perturbations to their inputs. Prior work on adversarial perturbations for deep reinforcement learning has generally relied on calculating an adversarial perturbation customized to each state visited by the agent. In this paper we propose a more realistic threat model in which the adversary computes the perturbation only once based on a single state. 
Furthermore, we show that to cause a deep reinforcement learning agent to fail it is enough to have only one adversarial offset vector in a black-box setting. We conduct experiments in various games from the Atari environment, and use our single-state adversaries to demonstrate the transferability of perturbations both between states of one MDP, and between entirely different MDPs. We believe our adversary framework reveals fundamental properties of the environments used in deep reinforcement learning training, and is a tangible step towards building robust and reliable deep reinforcement learning agents.", + "title": "Exploring Transferability of Perturbations in Deep Reinforcement Learning", + "authors": [ + "Ezgi Korkmaz" + ], + "emails": [ + "~Ezgi_Korkmaz1" + ], + "rank": 2583 + }, + { + "url": "https://openreview.net/forum?id=FPpZrRfz6Ss", + "decision": "Reject", + "ratings": [ + 3, + 5, + 4, + 5 + ], + "rating": "4.13", + "confidences": [ + 5, + 4, + 3, + 3 + ], + "abstract": "Meta learning, an effective way for learning unseen tasks with few samples, is an important research\narea in machine learning.\nModel Agnostic Meta-Learning~(MAML)~(\\cite{finn2017model}) is one of the most well-known gradient-based meta learning algorithms, that learns\nthe meta-initialization through the inner and outer optimization loop.\nThe inner loop is to perform fast adaptation in several gradient update steps with the support datapoints, \nwhile the outer loop to generalize the updated model to the query datapoints.\nRecently, it has been argued that instead of rapid learning and adaptation, the learned meta-initialization through MAML\nhas already absorbed the high-quality features prior, where the task-specific head at training \nfacilitates the feature learning.\nIn this work, we investigate the impact of the task-specific adaptation of MAML and discuss the general formula for\nother gradient-based and metric-based meta-learning approaches.\nFrom our analysis, we further 
devise the Random Decision Planes~(RDP) algorithm to find a suitable linear classifier\nwithout any gradient descent step and the Meta Contrastive Learning~(MCL) algorithm to exploit the inter-samples relationship\ninstead of the expensive inner-loop adaptation. \nWe conduct sufficient experiments on various datasets to explore our proposed algorithms.", + "title": "To Learn Effective Features: Understanding the Task-Specific Adaptation of MAML", + "authors": [ + "Zhijie Lin", + "Zhou Zhao", + "Zhu Zhang", + "Huai Baoxing", + "Jing Yuan" + ], + "emails": [ + "~Zhijie_Lin1", + "~Zhu_Zhang3", + "huawei.com", + "~Zhou_Zhao2" + ], + "rank": 2584 + }, + { + "url": "https://openreview.net/forum?id=loe6h28yoq", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4, + 5, + 3 + ], + "rating": "4.12", + "confidences": [ + 4, + 2, + 3, + 4, + 4 + ], + "abstract": "Data poisoning attacks aim to corrupt a machine learning model via modifying, adding, and/or removing some carefully selected training examples, such that the corrupted model predicts any or attacker-chosen incorrect labels for testing examples. The key idea of state-of-the-art certified defenses against data poisoning attacks is to create a \\emph{majority vote} mechanism to predict the label of a testing example. Moreover, each voter is a base classifier trained on a subset of the training dataset. Nearest neighbor algorithms such as k nearest neighbors (kNN) and radius nearest neighbors (rNN) have intrinsic majority vote mechanisms. In this work, we show that the intrinsic majority vote mechanisms in kNN and rNN already provide certified robustness guarantees against general data poisoning attacks. Moreover, our empirical evaluation results on MNIST and CIFAR10 show that the intrinsic certified robustness guarantees of kNN and rNN outperform those provided by state-of-the-art certified defenses. 
", + "title": "Certified Robustness of Nearest Neighbors against Data Poisoning Attacks", + "authors": [ + "Jinyuan Jia", + "Xiaoyu Cao", + "Neil Zhenqiang Gong" + ], + "emails": [ + "~Xiaoyu_Cao1", + "~Neil_Zhenqiang_Gong1", + "~Jinyuan_Jia2" + ], + "rank": 2585 + }, + { + "url": "https://openreview.net/forum?id=xcd5iTC6J-W", + "decision": "Reject", + "ratings": [ + 4, + 3, + 5, + 5 + ], + "rating": "4.12", + "confidences": [ + 4, + 5, + 3, + 4 + ], + "abstract": "Hidden Markov models (HMMs) are commonly used for disease progression modeling when the true state of a patient is not fully known. Since HMMs may have multiple local optima, performance can be improved by incorporating additional patient covariates to inform parameter estimation. To allow for this, we formulate a special case of recurrent neural networks (RNNs), which we name hidden Markov recurrent neural networks (HMRNNs), and prove that each HMRNN has the same likelihood function as a corresponding discrete-observation HMM. As a neural network, the HMRNN can also be combined with any other predictive neural networks that take patient covariate information as input. We first show that parameter estimates from HMRNNs are numerically close to those obtained from HMMs via the Baum-Welch algorithm, thus empirically validating their theoretical equivalence. We then demonstrate how the HMRNN can be combined with other neural networks to improve parameter estimation and prediction, using an Alzheimer's disease dataset. 
The HMRNN yields parameter estimates that improve disease forecasting performance and offer a novel clinical interpretation compared with a standard HMM.", + "title": "Hidden Markov models are recurrent neural networks: A disease progression modeling application", + "authors": [ + "Matthew Baucum", + "Anahita Khojandi", + "Theodore Papamarkou" + ], + "emails": [ + "~Theodore_Papamarkou1", + "~Anahita_Khojandi1", + "~Matthew_Baucum1" + ], + "rank": 2586 + }, + { + "url": "https://openreview.net/forum?id=yZBuYjD8Gd", + "decision": "Reject", + "ratings": [ + 5, + 5, + 2, + 5 + ], + "rating": "4.12", + "confidences": [ + 3, + 4, + 5, + 5 + ], + "abstract": "Self-supervised learning has recently begun to rival supervised learning on computer vision tasks. Many of the recent approaches have been based on contrastive instance discrimination (CID), in which the network is trained to recognize two augmented versions of the same instance (a query and positive while discriminating against a pool of other instances (negatives). Using MoCo v2 as our testbed, we divided negatives by their difficulty for a given query and studied which difficulty ranges were most important for learning useful representations. We found that a small minority of negatives--just the hardest 5%--were both necessary and sufficient for the downstream task to reach full accuracy. Conversely, the easiest 95% of negatives were unnecessary and insufficient. Moreover, we found that the very hardest 0.1% of negatives were not only unnecessary but also detrimental. Finally, we studied the properties of negatives that affect their hardness, and found that hard negatives were more semantically similar to the query, and that some negatives were more consistently easy or hard than we would expect by chance. 
Together, our results indicate that negatives play heterogeneous roles and CID may benefit from more intelligent negative treatment.", + "title": "Are all negatives created equal in contrastive instance discrimination?", + "authors": [ + "Tiffany Cai", + "Jonathan Frankle", + "David J. Schwab", + "Ari S. Morcos" + ], + "emails": [ + "~Jonathan_Frankle1", + "gmail.com", + "~David_J._Schwab1", + "~Ari_S._Morcos1" + ], + "rank": 2587 + }, + { + "url": "https://openreview.net/forum?id=ZHJlKWN57EQ", + "decision": "Reject", + "ratings": [ + 3, + 5, + 6, + 3 + ], + "rating": "4.12", + "confidences": [ + 4, + 5, + 3, + 5 + ], + "abstract": "State-of-the-art generic low-precision training algorithms use a mix of 16-bit and 32-bit precision, creating the folklore that 16-bit precision alone is not enough to maximize model accuracy. As a result, deep learning accelerators are forced to support both 16-bit and 32-bit compute units which is more costly than only using 16-bit units for hardware design. We ask can we do pure 16-bit training which requires only 16-bit compute units, while still matching the model accuracy attained by 32-bit training. Towards this end, we study pure 16-bit training algorithms on the widely adopted BFloat16 compute unit. While these units conventionally use nearest rounding to cast output to 16-bit precision, we show that nearest rounding for model weight updates can often cancel small updates, which degrades the convergence and model accuracy. Motivated by this, we identify two simple existing techniques, stochastic rounding and the Kahan accumulation, to remedy the model accuracy degradation in pure 16-bit training. We empirically show that these two techniques can enable up to 7% absolute validation accuracy gain in pure 16-bit training. This leads to 0.1% lower to 0.2% higher matching validation accuracy compared to 32-bit precision training across seven deep learning applications. 
", + "title": "Revisiting BFfloat16 Training", + "authors": [ + "Pedram Zamirai", + "Jian Zhang", + "Christopher R Aberger", + "Christopher De Sa" + ], + "emails": [ + "~Christopher_De_Sa2", + "~Christopher_R_Aberger1", + "~Pedram_Zamirai1", + "~Jian_Zhang1" + ], + "rank": 2588 + }, + { + "url": "https://openreview.net/forum?id=iTeUSEw5rl2", + "decision": "Reject", + "ratings": [ + 4, + 3, + 5, + 5 + ], + "rating": "4.12", + "confidences": [ + 4, + 5, + 4, + 3 + ], + "abstract": "Existing continual learning benchmarks often assume each task's training and test data are from the same distribution, which may not hold in practice. Towards making continual learning practical, in this paper, we introduce a novel setting of online continual learning under conditional domain shift, in which domain shift exists between training and test data of all tasks: Ptr(X,Y)\u2260Pte(X,Y), and the model is required to generalize to unseen domains at test time. To address this problem, we propose \\emph{Conditional Invariant Experience Replay (CIER)} that can simultaneously retain old knowledge, acquire new information, and generalize to unseen domains. CIER employs an adversarial training to correct the shift in P(X,Y) by matching P(X|Y), which results in an invariant representation that can generalize to unseen domains during inference. Our extensive experiments show that CIER can bridge the domain gap in continual learning and significantly outperforms state-of-the-art methods. 
We will release our benchmarks and implementation upon acceptance.", + "title": "Online Continual Learning Under Domain Shift", + "authors": [ + "Quang Pham", + "Chenghao Liu", + "Steven HOI" + ], + "emails": [ + "~Steven_HOI1", + "~Quang_Pham1", + "~Chenghao_Liu1" + ], + "rank": 2589 + }, + { + "url": "https://openreview.net/forum?id=qOCdZn3lQIJ", + "decision": "Reject", + "ratings": [ + 5, + 2, + 4, + 6 + ], + "rating": "4.12", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "We propose SignXOR, a novel compression scheme that exploits temporal correlation of gradients for the purpose of gradient compression. Sign-based schemes such as Scaled-sign and SignSGD (Bernstein et al., 2018; Karimireddy et al., 2019) compress gradients by storing only the sign of gradient entries. These methods, however, ignore temporal correlations between gradients. The equality or non-equality of signs of gradients in two consecutive iterations can be represented by a binary vector, which can be further compressed depending on its entropy. By implementing a rate-distortion encoder we increase the temporal correlation of gradients, lowering entropy and improving compression. We achieve theoretical convergence of SignXOR by employing the two-way error-feedback approach introduced by Zheng et al. (2019). Zheng et al. (2019) show that two-way compression with error-feedback achieves the same asymptotic convergence rate as SGD, although convergence is slower by a constant factor. We strengthen their analysis to show that the rate of convergence of two-way compression with errorfeedback asymptotically is the same as that of SGD. As a corollary we prove that two-way SignXOR compression with error-feedback achieves the same asymptotic rate of convergence as SGD. We numerically evaluate our proposed method on the CIFAR-100 and ImageNet datasets and show that SignXOR requires less than 50% of communication traffic compared to sending sign of gradients. 
To the best of our knowledge we are the first to present a gradient compression scheme that exploits temporal correlation of gradients.", + "title": "Compressing gradients in distributed SGD by exploiting their temporal correlation", + "authors": [ + "Tharindu Adikari", + "Stark Draper" + ], + "emails": [ + "~Stark_Draper1", + "~Tharindu_Adikari1" + ], + "rank": 2590 + }, + { + "url": "https://openreview.net/forum?id=zWvMjL6o60V", + "decision": "Reject", + "ratings": [ + 2, + 8, + 4, + 4 + ], + "rating": "4.11", + "confidences": [ + 5, + 3, + 5, + 5 + ], + "abstract": "Mutual Information (MI) based feature selection makes use of MI to evaluate each feature and eventually shortlist a relevant feature subset, in order to address issues associated with high-dimensional datasets.\nDespite the effectiveness of MI in feature selection, we have noticed that many state-of-the-art algorithms disregard the so-called unique relevance (UR) of features, which is a necessary condition for the optimal feature subset. In fact, in our study of seven state-of-the-art and classical MIBFS algorithms, we find that all of them underperform as they ignore UR of features and arrive at a suboptimal selected feature subset which contains a non-negligible number of redundant features. We point out that the heart of the problem is that all these MIBFS algorithms follow the criterion of Maximize Relevance with Minimum Redundancy (MRwMR), which does not explicitly target UR. This motivates us to augment the existing criterion with the objective of boosting unique relevance (BUR), leading to a new criterion called MRwMR-BUR. We conduct extensive experiments with several MIBFS algorithms with and without incorporating UR. The results indicate that the algorithms that boost UR consistently outperform their unboosted counterparts in terms of peak accuracy and number of features required. 
Furthermore, we propose a classifier based approach to estimate UR that further improves the performance of MRwMR-BUR based algorithms.", + "title": "Improving Mutual Information based Feature Selection by Boosting Unique Relevance", + "authors": [ + "Shiyu Liu", + "Mehul Motani" + ], + "emails": [ + "~Mehul_Motani1", + "~Shiyu_Liu1" + ], + "rank": 2591 + }, + { + "url": "https://openreview.net/forum?id=n5yBuzpqqw", + "decision": "Reject", + "ratings": [ + 6, + 3, + 3, + 5 + ], + "rating": "4.11", + "confidences": [ + 4, + 5, + 5, + 4 + ], + "abstract": "In the reinforcement learning (RL) algorithms which incorporate function approximation methods, the approximation error of value function inevitably cause overestimation phenomenon and have a negative impact on the convergence of the algorithms. To mitigate the negative effects of approximation error, we propose a new actor-critic algorithm called Error Controlled Actor-critic which ensures confining the approximation error in value function. In this paper, we firstly present an analysis of how the approximation error can hinder the optimization process of actor-critic methods. 
Then, we *derive an upper boundary of the approximation error of Q function approximator, and found that the error can be lowered by placing restrictions on the KL-divergence between every two consecutive policies during the training phase of the policy.* The results of experiments on a range of continuous control tasks from OpenAI gym suite demonstrate that the proposed actor-critic algorithm apparently reduces the approximation error and significantly outperforms other model-free RL algorithms.", + "title": "Error Controlled Actor-Critic Method to Reinforcement Learning", + "authors": [ + "Xingen Gao", + "Fei Chao", + "Changle Zhou", + "Zhen Ge", + "Chih-Min Lin", + "Longzhi Yang", + "Xiang Chang", + "Changjing Shang" + ], + "emails": [ + "saturn.yzu.edu.tw", + "xmu.edu.cn", + "aber.ac.uk", + "northumbria.ac.uk", + "student.uts.edu.au", + "stu.xmu.edu.cn" + ], + "rank": 2592 + }, + { + "url": "https://openreview.net/forum?id=wiSgdeJ29ee", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4, + 5, + 3 + ], + "rating": "4.11", + "confidences": [ + 4, + 2, + 4, + 4, + 4 + ], + "abstract": "In offline reinforcement learning (RL), we attempt to learn a control policy from a fixed dataset of environment interactions. This setting has the potential benefit of allowing us to learn effective policies without needing to collect additional interactive data, which can be expensive or dangerous in real-world systems. However, traditional off-policy RL methods tend to perform poorly in this setting due to the distributional shift between the fixed data set and the learned policy. In particular, they tend to extrapolate optimistically and overestimate the action-values outside of the dataset distribution. Recently, two major avenues have been explored to address this issue. First, behavior-regularized methods that penalize actions that deviate from the demonstrated action distribution. 
Second, uncertainty-aware model-based (MB) methods that discourage state-actions where the dynamics are uncertain. In this work, we propose an algorithmic framework that consists of two stages. In the first stage, we train a policy using behavior-regularized model-free RL on the offline dataset. Then, a second stage where we fine-tune the policy using our novel Model-Based Behavior-Regularized Policy Optimization (MB2PO) algorithm. We demonstrate that for certain tasks and dataset distributions our conservative model-based fine-tuning can greatly increase performance and allow the agent to generalize and outperform the demonstrated behavior. We evaluate our method on a variety of the Gym-MuJoCo tasks in the D4RL benchmark and demonstrate that our method is competitive and in some cases superior to the state of the art for most of the evaluated tasks.", + "title": "Fine-Tuning Offline Reinforcement Learning with Model-Based Policy Optimization", + "authors": [ + "Adam Villaflor", + "John Dolan", + "Jeff Schneider" + ], + "emails": [ + "~Jeff_Schneider1", + "~John_Dolan1", + "~Adam_Villaflor1" + ], + "rank": 2593 + }, + { + "url": "https://openreview.net/forum?id=UJRFjuJDsIO", + "decision": "Reject", + "ratings": [ + 3, + 5, + 3, + 6 + ], + "rating": "4.11", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "It has been repeatedly observed that convolutional architectures when applied to image understanding tasks learn oriented bandpass filters. A standard explanation of this result is that these filters reflect the structure of the images that they have been exposed to during training: Natural images typically are locally composed of oriented contours at various scales and oriented bandpass filters are matched to such structure. We offer an alternative explanation based not on the structure of images, but rather on the structure of convolutional architectures. In particular, complex exponentials are the eigenfunctions of convolution. 
These eigenfunctions are defined globally; however, convolutional architectures operate locally. To enforce locality, one can apply a windowing function to the eigenfunctions, which leads to oriented bandpass filters as the natural operators to be learned with convolutional architectures. From a representational point of view, these filters allow for a local systematic way to characterize and operate on an image or other signal. We offer empirical support for the hypothesis that convolutional networks learn such filters at all of their convolutional layers. While previous research has shown evidence of filters having oriented bandpass characteristics at early layers, ours appears to be the first study to document the predominance of such filter characteristics at all layers. Previous studies have missed this observation because they have concentrated on the cumulative compositional effects of filtering across layers, while we examine the filter characteristics that are present at each layer.", + "title": "Why Convolutional Networks Learn Oriented Bandpass Filters: Theory and Empirical Support", + "authors": [ + "Isma Hadji", + "Richard Wildes" + ], + "emails": [ + "~Richard_Wildes1", + "~Isma_Hadji2" + ], + "rank": 2594 + }, + { + "url": "https://openreview.net/forum?id=mOO-LfEVZK", + "decision": "Reject", + "ratings": [ + 5, + 1, + 7, + 4, + 5 + ], + "rating": "4.11", + "confidences": [ + 3, + 5, + 3, + 3, + 5 + ], + "abstract": "The problem of defending against adversarial attacks has attracted increasing attention in recent years. While various types of defense methods (e.g., adversarial training, detection and rejection, and recovery) were proven empirically to bring robustness to the network, their weakness was shown by later works. 
Inspired by the observation from the distribution properties of the features extracted by the CNNs in the feature space and their link to robustness, this work designs a novel training process called Manifold-Aware Training (MAT), which forces CNNs to learn compact features to increase robustness. The effectiveness of the proposed method is evaluated via comparisons with existing defense mechanisms, i.e., the TRADES algorithm, which has been recognized as a representative state-of-the-art technology, and the MMC method, which also aims to learn compact features. Further verification is also conducted using the attack adaptive to our method. Experimental results show that MAT-trained CNNs exhibit significantly higher performance than state-of-the-art robustness.", + "title": "Manifold-aware Training: Increase Adversarial Robustness with Feature Clustering", + "authors": [ + "Ting-An Yen", + "Chun-Shien Lu", + "Pau-Choo Chung" + ], + "emails": [ + "~Pau-Choo_Chung1", + "~Ting-An_Yen1", + "~Chun-Shien_Lu1" + ], + "rank": 2595 + }, + { + "url": "https://openreview.net/forum?id=uMDbGsVjCS4", + "decision": "Reject", + "ratings": [ + 5, + 3, + 4 + ], + "rating": "4.10", + "confidences": [ + 4, + 3, + 3 + ], + "abstract": "We present a new method for reconstructing and refining complex surfaces based on physical simulations. Taking a roughly approximated simulation as input, our method infers corresponding spatial details while taking into account how they evolve over time. We consider this problem in terms of spatial and temporal frequencies, and leverage generative adversarial networks to learn the desired spatio-temporal signal for the surface dynamics. Furthermore, we investigate the possibility to train our network in an unsupervised manner, i.e. without predefined training pairs. 
We highlight the capabilities of our method with a set of synthetic wave function tests and complex 3D dynamics of elasto-plastic materials.", + "title": "Frequency-aware Interface Dynamics with Generative Adversarial Networks", + "authors": [ + "Lukas Prantl", + "Tassilo Kugelstadt", + "Jan Bender", + "Nils Thuerey" + ], + "emails": [ + "cs.rwth-aachen.de", + "~Jan_Bender1", + "~Nils_Thuerey1", + "~Lukas_Prantl1" + ], + "rank": 2596 + }, + { + "url": "https://openreview.net/forum?id=RayUtcIlGz", + "decision": "Reject", + "ratings": [ + 5, + 6, + 6, + 2 + ], + "rating": "4.08", + "confidences": [ + 3, + 3, + 1, + 5 + ], + "abstract": "Many types of neural network layers rely on matrix properties such as invertibility or orthogonality.\nRetaining such properties during optimization with gradient-based stochastic optimizers is a challenging task, which is usually addressed by either reparameterization of the affected parameters or by directly optimizing on the manifold.\nThis work presents a novel approach for training invertible linear layers. In lieu of directly optimizing\nthe network parameters, we train rank-one perturbations and add them to the actual weight matrices infrequently. This P4Inv update allows keeping track of inverses and determinants without ever explicitly computing them. We show how such invertible blocks improve the mixing and thus the mode separation of the resulting normalizing flows. 
Furthermore, we outline how the P4 concept can be utilized to retain properties other than invertibility.", + "title": "Training Invertible Linear Layers through Rank-One Perturbations", + "authors": [ + "Andreas Kr\u00e4mer", + "Jonas K\u00f6hler", + "Frank Noe" + ], + "emails": [ + "~Jonas_K\u00f6hler1", + "~Frank_Noe1", + "~Andreas_Kr\u00e4mer1" + ], + "rank": 2597 + }, + { + "url": "https://openreview.net/forum?id=FzGiUKN4aBp", + "ratings": [ + 4, + 5, + 3, + 5 + ], + "rating": "4.08", + "confidences": [ + 3, + 2, + 4, + 3 + ], + "abstract": "Out-of-Distribution (OOD) generalization is a problem of seeking the predictor function whose performance in the worst environment is optimal. \nThis paper makes both theoretical and algorithmic contributions to the OOD problem.\nWe consider a set of all invariant features conditioned to which the target variable and the environment variable becomes independent, and theoretically prove that one can seek an OOD optimal predictor by looking for the mutual-information maximizing feature amongst the invariant features. \nWe establish this result as \\textit{Maximal Invariant Predictor condition}. \nOur theoretical work is closely related to approaches like Invariant Risk Minimization and Invariant Rationalization.\nWe also derive from our theory the \\textit{Inter Gradient Alignment}(IGA) algorithm that uses a parametrization trick to conduct \\textit{feature searching} and \\textit{predictor training} at once. \nWe develop an extension of the Colored-MNIST that can more accurately represent the pathological OOD situation than the original version, and demonstrate the superiority of IGA over previous methods on both the original and the extended version of Colored-MNIST. 
", + "title": "Out-of-Distribution Generalization with Maximal Invariant Predictor", + "authors": [ + "Masanori Koyama", + "Shoichiro Yamaguchi" + ], + "emails": [ + "~Masanori_Koyama1", + "~Shoichiro_Yamaguchi1" + ], + "rank": 2598 + }, + { + "url": "https://openreview.net/forum?id=PTG9NdIn3wt", + "ratings": [ + 3, + 5, + 5, + 4 + ], + "rating": "4.08", + "confidences": [ + 4, + 3, + 2, + 3 + ], + "abstract": "Distributional text clustering delivers semantically informative representations and captures the relevance between each word and semantic clustering centroids. We extend the neural text clustering approach to text classification tasks by inducing cluster centers via a latent variable model and interacting with distributional word embeddings, to enrich the representation of tokens and measure the relatedness between tokens and each learnable cluster centroid. The proposed method jointly learns word clustering centroids and clustering-token alignments, achieving the state of the art results on multiple benchmark datasets and proving that the proposed cluster-token alignment mechanism is indeed favorable to text classification. Notably, our qualitative analysis has conspicuously illustrated that text representations learned by the proposed model are in accord well with our intuition.", + "title": "Neural Text Classification by Jointly Learning to Cluster and Align", + "authors": [ + "Yekun Chai", + "Haidong Zhang", + "Shuo Jin" + ], + "emails": [ + "~Yekun_Chai1", + "pitt.edu", + "yahoo.com" + ], + "rank": 2599 + }, + { + "url": "https://openreview.net/forum?id=CVZMcRg_bd", + "ratings": [ + 3, + 5, + 4, + 4 + ], + "rating": "4.08", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "The paradigm of differentiable programming has considerably enhanced the scope of machine learning via the judicious use of gradient-based optimization. 
However, standard differentiable programming methods (such as autodiff) typically require that the models be differentiable, limiting their applicability. We introduce a new, principled approach to extend gradient-based optimization to piecewise smooth models, such as k-histograms, splines, and segmentation maps. We derive an accurate form to the weak Jacobian of such functions, and show that it exhibits a block-sparse structure that can be computed implicitly and efficiently. We show that using the redesigned Jacobian leads to improved performance in applications such as denoising with piecewise polynomial regression models, data-free generative model training, and image segmentation.", + "title": "Differentiable Programming for Piecewise Polynomial Functions", + "authors": [ + "Minsu Cho", + "Ameya Joshi", + "Xian Yeow Lee", + "Aditya Balu", + "Adarsh Krishnamurthy", + "Baskar Ganapathysubramanian", + "Soumik Sarkar", + "Chinmay Hegde" + ], + "emails": [ + "~Xian_Yeow_Lee1", + "~Aditya_Balu1", + "~Chinmay_Hegde1", + "~Ameya_Joshi2", + "~Minsu_Cho2", + "~Baskar_Ganapathysubramanian1", + "~Adarsh_Krishnamurthy1", + "~Soumik_Sarkar1" + ], + "rank": 2600 + }, + { + "url": "https://openreview.net/forum?id=I6-3mg29P6y", + "decision": "Reject", + "ratings": [ + 3, + 6, + 4 + ], + "rating": "4.08", + "confidences": [ + 5, + 3, + 4 + ], + "abstract": "Hessian based measures of flatness, such as the trace, Frobenius and spectral norms, have been argued, used and shown to relate to generalisation. In this paper we demonstrate that, for feed-forward neural networks under the cross-entropy loss, low-loss solutions with large neural network weights have small Hessian based measures of flatness. This implies that solutions obtained without L2 regularisation should be less sharp than those with despite generalising worse. 
We show this to be true for logistic regression, multi-layer perceptrons, simple convolutional, pre-activated and wide residual networks on the MNIST and CIFAR-100 datasets. Furthermore, we show that adaptive optimisation algorithms using iterate averaging, on the VGG-16 network and CIFAR-100 dataset, achieve superior generalisation to SGD but are 30\u00d7 sharper. These theoretical and experimental results further advocate the need to use flatness in conjunction with the weights scale to measure generalisation \\citep{neyshabur2017exploring,dziugaite2017computing}. ", + "title": "Flatness is a False Friend", + "authors": [ + "Diego Granziol" + ], + "emails": [ + "~Diego_Granziol1" + ], + "rank": 2601 + }, + { + "url": "https://openreview.net/forum?id=H8VDvtm1ij8", + "decision": "Reject", + "ratings": [ + 3, + 4, + 5, + 7 + ], + "rating": "4.08", + "confidences": [ + 5, + 4, + 3, + 1 + ], + "abstract": "In machine learning, due to model misspecification and overfitting, estimates of the aleatoric uncertainty are often inaccurate.\nOne approach to fix this is isotonic regression, in which a monotonic function is fit on a validation set to map the model's CDF to an optimally calibrated CDF. However, this makes it infeasible to compute additional statistics of interest on the model distribution (such as the mean). In this paper, through a reframing of recalibration as MLE, we replace isotonic regression with normalizing flows. This allows us to retain the ability to compute the statistical properties of the model (such as closed-form likelihoods, mean, correlation, etc.) and provides an opportunity for additional capacity at the cost of possible overfitting. Most importantly, the fundamental properties of normalizing flows allow us to generalize recalibration to conditional and multivariate distributions. 
To aid in detecting miscalibration and measuring our success at fixing it, we use a simple extension of the calibration Q-Q plot.", + "title": "Normalizing Flows for Calibration and Recalibration", + "authors": [ + "Achintya Gopal", + "Aaron Key" + ], + "emails": [ + "~Aaron_Key1", + "~Achintya_Gopal1" + ], + "rank": 2602 + }, + { + "url": "https://openreview.net/forum?id=nPVlVsBTiJ", + "decision": "Reject", + "ratings": [ + 3, + 7, + 3, + 4 + ], + "rating": "4.07", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Machine learning models are vulnerable to adversarial attacks. One approach to addressing this vulnerability is certification, which focuses on models that are guaranteed to be robust for a given perturbation size. A drawback of recent certified models is that they are stochastic: they require multiple computationally expensive model evaluations with random noise added to a given image. In our work, we present a deterministic certification approach which results in a certifiably robust model. This approach is based on an equivalence between training with a particular regularized loss, and the expected values of Gaussian averages. We achieve certified models on ImageNet-1k by retraining a model with this loss for one epoch without the use of label information.", + "title": "Adversarial Boot Camp: label free certified robustness in one epoch", + "authors": [ + "Ryan Campbell", + "Chris Finlay", + "Adam M Oberman" + ], + "emails": [ + "~Adam_M_Oberman1", + "~Chris_Finlay1", + "~Ryan_Campbell2" + ], + "rank": 2603 + }, + { + "url": "https://openreview.net/forum?id=CxGPf2BPVA", + "decision": "Reject", + "ratings": [ + 3, + 5, + 5, + 4 + ], + "rating": "4.07", + "confidences": [ + 5, + 2, + 4, + 4 + ], + "abstract": "In most machine learning algorithms, training data is assumed to be independent and identically distributed (iid). 
\nWhen it is not the case, the performances of the algorithms are challenged, leading to the famous phenomenon of \\textit{catastrophic forgetting}. Algorithms dealing with it are gathered in the \\textit{Continual Learning} research field. In this paper, we study the \\textit{regularization} based approaches to continual learning and show that those approaches can not learn to discriminate classes from different tasks in an elemental continual benchmark, the class-incremental setting.\nWe make theoretical reasoning to prove this shortcoming and illustrate it with experiments.\nMoreover, we show that it can have some important consequences on multi-tasks reinforcement learning or in pre-trained models used for continual learning.\nWe believe this paper to be the first to propose a theoretical description of regularization shortcomings for continual learning. ", + "title": "Regularization Shortcomings for Continual Learning", + "authors": [ + "Timothee LESORT", + "Andrei Stoian" + ], + "emails": [ + "~Timothee_LESORT1", + "thalesgroup.com" + ], + "rank": 2604 + }, + { + "url": "https://openreview.net/forum?id=hEnTjYdnZD", + "ratings": [ + 5, + 4, + 3, + 4 + ], + "rating": "4.07", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "The Rectified Linear Unit (ReLU) is a widely used activation function in deep neural networks, and several works are devoted to designing its variants to improve performance. However, the output is unbounded for most of such functions, which brings severe accuracy degeneration when the full-precision model is quantized. To tackle the problem of unboundedness, Choi etal. (2019) introduce an activation clipping parameter for the standard ReLU. In this paper, we propose a Bilateral Clipping Parametric Rectified Linear Unit (BCPReLU) as a generalized version of ReLU and some variants of ReLU. Specifically, the trainable slope and truncation parameters for both positive and negative input are introduced in BCPReLU . 
We theoretically prove that BCPReLU has almost the same expressive ability as the corresponding unbounded one, and establish its convergence in low-bit quantization training. Numerical experiments on a range of popular models and datasets verify its effectiveness, which outperforms the state-of-the-art methods.", + "title": "LEARNING BILATERAL CLIPPING PARAMETRIC ACTIVATION FUNCTION FOR LOW-BIT NEURAL NETWORKS", + "authors": [ + "Yunlong Ding", + "Rui Wu", + "Dirong Chen" + ], + "emails": [ + "~Dirong_Chen1", + "horizon.ai", + "~Yunlong_Ding1" + ], + "rank": 2605 + }, + { + "url": "https://openreview.net/forum?id=bGZtz5-Cmkz", + "decision": "Reject", + "ratings": [ + 4, + 4, + 3, + 5 + ], + "rating": "4.07", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Learning and optimizing a blackbox function is a common task in Bayesian optimization and experimental design. In real-world scenarios (e.g., tuning hyper-parameters for deep learning models, synthesizing a protein sequence, etc.), these functions tend to be expensive to evaluate and often rely on high-dimensional inputs. While classical Bayesian optimization algorithms struggle in handling the scale and complexity of modern experimental design tasks, recent works attempt to get around this issue by applying neural networks ahead of the Gaussian process to learn a (low-dimensional) latent representation. We show that such learned representation often leads to collisions in the latent space: two points with significantly different observations collide in the learned latent space. Collisions could be regarded as additional noise introduced by the traditional neural network, leading to degraded optimization performance. To address this issue, we propose Collision-Free Latent Space Optimization (CoFLO), which employs a novel regularizer to reduce the collision in the learned latent space and encourage the mapping from the latent space to objective value to be Lipschitz continuous. 
CoFLO takes in pairs of data points and penalizes those too close in the latent space compared to their target space distance. We provide a rigorous theoretical justification for the regularizer by inspecting the regret of the proposed algorithm. Our empirical results further demonstrate the effectiveness of CoFLO on several synthetic and real-world Bayesian optimization tasks, including a case study for computational cosmic experimental design.", + "title": "Learning Collision-free Latent Space for Bayesian Optimization", + "authors": [ + "Fengxue Zhang", + "Yair Altas", + "Louise Fan", + "Kaustubh Vinchure", + "Brian Nord", + "Yuxin Chen" + ], + "emails": [ + "uchicago.edu", + "~Fengxue_Zhang1", + "~Yuxin_Chen1" + ], + "rank": 2606 + }, + { + "url": "https://openreview.net/forum?id=jlVNBPEDynH", + "decision": "Reject", + "ratings": [ + 4, + 3, + 3, + 7 + ], + "rating": "4.07", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Although model-based and model-free approaches to learning the control of systems have achieved impressive results on standard benchmarks, generalization to variations in the task are still unsatisfactory. Recent results suggest that generalization of standard architectures improves only after obtaining exhaustive amounts of data.We give evidence that the generalization capabilities are in many cases bottlenecked by the inability to generalize on the combinatorial aspects. Further, we show that for a certain subclass of the MDP framework, this can be alleviated by neuro-algorithmic architectures.\n\nMany control problems require long-term planning that is hard to solve generically with neural networks alone. We introduce a neuro-algorithmic policy architecture consisting of a neural network and an embedded time-depended shortest path solver. These policies can be trained end-to-end by blackbox differentiation. 
We show that this type of architecture generalizes well to unseen variations in the environment already after seeing a few examples.\n\n\nhttps://sites.google.com/view/neuro-algorithmic", + "title": "Neuro-algorithmic Policies for Discrete Planning", + "authors": [ + "Marin Vlastelica Pogan\u010di\u0107", + "Michal Rolinek", + "Georg Martius" + ], + "emails": [ + "~Michal_Rolinek2", + "~Georg_Martius1", + "~Marin_Vlastelica_Pogan\u010di\u01071" + ], + "rank": 2607 + }, + { + "url": "https://openreview.net/forum?id=yoem5ud2vb", + "decision": "Reject", + "ratings": [ + 5, + 3, + 5, + 4 + ], + "rating": "4.07", + "confidences": [ + 3, + 5, + 3, + 4 + ], + "abstract": "Animals are able to discover the topological map (graph) of surrounding environment, which will be used for navigation. Inspired by this biological phenomenon, researchers have recently proposed to learn a graph representation for Markov decision process (MDP) and use such graphs for planning in reinforcement learning (RL). However, existing learning-based graph generation methods suffer from many drawbacks. One drawback is that existing methods do not learn an abstraction for graphs, which results in high memory and computation cost. This drawback also makes generated graph non-robust, which degrades the planning performance. Another drawback is that existing methods cannot be used for facilitating exploration which is important in RL. In this paper, we propose a new method, called topological map abstraction (TOMA), for graph generation. TOMA can learn an abstract graph representation for MDP, which costs much less memory and computation cost than existing methods. Furthermore, TOMA can be used for facilitating exploration. In particular, we propose planning to explore, in which TOMA is used to accelerate exploration by guiding the agent towards unexplored states. A novel experience replay module called vertex memory is also proposed to improve exploration performance. 
Experimental results show that TOMA can outperform existing methods to achieve the state-of-the-art performance.", + "title": "TOMA: Topological Map Abstraction for Reinforcement Learning", + "authors": [ + "Zhao Heng Yin", + "Wu-Jun Li" + ], + "emails": [ + "~Wu-Jun_Li1", + "~Zhao_Heng_Yin1" + ], + "rank": 2608 + }, + { + "url": "https://openreview.net/forum?id=fpew0ll7wl", + "ratings": [ + 5, + 4, + 4, + 3 + ], + "rating": "4.07", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Enabling robust intelligence in the wild entails learning systems that offer uninterrupted inference while affording sustained learning from varying amounts of data and supervision. Such ML systems must be able to cope with the openness and variability inherent to the real world. The machine learning community has organically broken down this challenging task into manageable sub tasks such as supervised, few-shot, continual, and self-supervised learning; each affording distinct challenges and a unique set of methods. Notwithstanding this remarkable progress, the simplified and isolated nature of these experimental setups has resulted in methods that excel in their specific settings, but struggle to generalize beyond them. To foster research towards more general ML systems, we present a new learning and evaluation framework - I{N} TH{E} WIL{D} (NED). NED naturally integrates the objectives of previous frameworks while removing many of the overly strong assumptions such as predefined training and test phases, sufficient labeled data for every class, and the closed-world assumption. In NED, a learner faces a stream of data and must make sequential predictions while choosing how to update itself, adapt quickly to novel classes, and deal with changing data distributions; while optimizing for the total amount of compute. 
We present novel insights from NED that contradict the findings of less realistic or smaller-scale experiments which emphasizes the need to move towards more pragmatic setups. For example, we show that meta-training causes larger networks to overfit in a way that supervised training does not, few-shot methods break down outside of their narrow experimental setting, and self-supervised method MoCo performs significantly worse when the downstream task contains new and old classes. Additionally, we present two new pragmatic methods (Exemplar Tuning and Minimum Distance Thresholding) that significantly outperform all other methods evaluated in NED.", + "title": "In the Wild: From ML Models to Pragmatic ML Systems", + "authors": [ + "Matthew Wallingford", + "Aditya Kusupati", + "Keivan Alizadeh-Vahid", + "Aaron Walsman", + "Aniruddha Kembhavi", + "Ali Farhadi" + ], + "emails": [ + "~Aaron_Walsman1", + "~Aditya_Kusupati1", + "~Keivan_Alizadeh-Vahid1", + "cs.washington.edu", + "~Aniruddha_Kembhavi1", + "~Ali_Farhadi3" + ], + "rank": 2609 + }, + { + "url": "https://openreview.net/forum?id=OifRuTHyQU", + "ratings": [ + 5, + 5, + 3, + 4 + ], + "rating": "4.07", + "confidences": [ + 3, + 3, + 5, + 4 + ], + "abstract": "The ability to preserve local geometry of highly nonlinear manifolds in high dimensional spaces and properly unfold them into lower dimensional hyperplanes is the key to the success of manifold computing, nonlinear dimensionality reduction (NLDR) and visualization. This paper proposes a novel method, called elastic locally isometric smoothness (ELIS), to empower deep neural networks with such an ability. ELIS requires that a desired metric between points should be preserved across layers in order to preserve local geometry; such a smoothness constraint effectively regularizes vector-based transformations to become well-behaved local metric-preserving homeomorphisms. 
Moreover, ELIS requires that the smoothness should be imposed in a way to render sufficient flexibility for tackling complicated nonlinearity and non-Euclideanity; this is achieved layer-wisely via nonlinearity in both the similarity and activation functions. The ELIS method incorporates a class of suitable nonlinear similarity functions into a two-way divergence loss and uses hyperparameter continuation in finding optimal solutions. Extensive experiments, comparisons, and ablation study demonstrate that ELIS can deliver results not only superior to UMAP and t-SNE for and visualization but also better than other leading counterparts of manifold and autoencoder learning for NLDR and manifold data generation.", + "title": "Deep Manifold Computing and Visualization Using Elastic Locally Isometric Smoothness", + "authors": [ + "Stan Z. Li", + "Zelin Zang", + "Lirong Wu" + ], + "emails": [ + "~Zelin_Zang2", + "~Lirong_Wu1", + "~Stan_Z._Li2" + ], + "rank": 2610 + }, + { + "url": "https://openreview.net/forum?id=0h9cYBqucS6", + "decision": "Reject", + "ratings": [ + 4, + 3, + 6, + 4 + ], + "rating": "4.07", + "confidences": [ + 4, + 5, + 3, + 3 + ], + "abstract": "Federated learning has been spotlighted as a way to train neural network models using data distributed over multiple clients without a need to share private data. Unfortunately, however, it has been shown that data privacy could not be fully guaranteed as adversaries may be able to extract certain information on local data from the model parameters transmitted during federated learning. A recent solution based on the secure aggregation primitive enables privacy-preserving federated learning, but at the expense of significant extra communication/computational resources. 
In this paper, we propose communication-computation efficient secure aggregation which reduces the amount of communication/computational resources at least by a factor of n/log\u2061n relative to the existing secure solution without sacrificing data privacy, where n is the number of clients. The key idea behind the suggested scheme is to design the topology of the secret-sharing nodes (denoted by the assignment graph G) as sparse random graphs instead of the complete graph corresponding to the existing solution. We first obtain a sufficient condition on G to guarantee reliable and private federated learning. Afterwards, we suggest using the Erd\\H{o}s-R\u00e9nyi graph as G, and provide theoretical guarantees on the reliability/privacy of the proposed scheme. Through extensive real-world experiments, we demonstrate that our scheme, using only 50% of the resources required in the conventional scheme, maintains virtually the same levels of reliability and data privacy in practical federated learning systems.", + "title": "Communication-Computation Efficient Secure Aggregation for Federated Learning", + "authors": [ + "Beongjun Choi", + "Jy-yong Sohn", + "Dong-Jun Han", + "Jaekyun Moon" + ], + "emails": [ + "~Dong-Jun_Han1", + "~Jaekyun_Moon2", + "~Jy-yong_Sohn1", + "kaist.ac.kr" + ], + "rank": 2611 + }, + { + "url": "https://openreview.net/forum?id=JUgC3lqn6r2", + "ratings": [ + 5, + 5, + 5, + 2 + ], + "rating": "4.06", + "confidences": [ + 3, + 4, + 4, + 5 + ], + "abstract": "Simplicity is the ultimate sophistication. Differentiable Architecture Search (DARTS) has now become one of the mainstream paradigms of neural architecture search. However, it largely suffers from the well-known performance collapse issue. Such aggregation is thought to have overly benefited from the residual structure which accelerates the information flow. To weaken this impact, we propose to inject unbiased random noise to allow fair competition for candidate operations. 
We name this novel approach as NoisyDARTS. In effect, a network optimizer should perceive this difficulty at each training step and refrain from overshooting, especially on skip connections. In the long run, since we add no bias to the gradient in terms of expectation, it is still likely to converge to the right solution area. We also prove that the injected noise plays a role in smoothing the loss landscape, which makes the optimization easier. Compared with the existing work, our method features extreme simplicity and acts as a new strong baseline. \n", + "title": "Noisy Differentiable Architecture Search", + "authors": [ + "Xiangxiang Chu", + "Bo Zhang" + ], + "emails": [ + "~Bo_Zhang7", + "~Xiangxiang_Chu1" + ], + "rank": 2612 + }, + { + "url": "https://openreview.net/forum?id=rUVFU1oyAoy", + "decision": "Reject", + "ratings": [ + 5, + 4, + 3, + 4 + ], + "rating": "4.06", + "confidences": [ + 4, + 4, + 3, + 5 + ], + "abstract": "Continual learning aims to prevent catastrophic forgetting while learning a new task without accessing data of previously learned tasks. \nThe memory for such learning scenarios build a small subset of the data for previous tasks and is used in various ways such as quadratic programming and sample selection. 
\nCurrent memory-based continual learning algorithms are formulated as a constrained optimization problem and rephrase constraints as a gradient-based approach.\nHowever, previous works have not provided the theoretical proof on convergence to previously learned tasks.\nIn this paper, we propose a theoretical convergence analysis of continual learning based on stochastic gradient descent method.\nOur method, nonconvex continual learning (NCCL), can achieve the same convergence rate when the proposed catastrophic forgetting term is suppressed at each iteration.\nWe also show that memory-based approaches have an inherent problem of overfitting to memory, which degrades the performance on previously learned tasks, namely catastrophic forgetting.\nWe empirically demonstrate that NCCL successfully performs continual learning with episodic memory by scaling learning rates adaptive to mini-batches on several image classification tasks. ", + "title": "Nonconvex Continual Learning with Episodic Memory", + "authors": [ + "Sungyeob Han", + "Yeongmo Kim", + "Jungwoo Lee" + ], + "emails": [ + "~Sungyeob_Han1", + "~Yeongmo_Kim1", + "~Jungwoo_Lee1" + ], + "rank": 2613 + }, + { + "url": "https://openreview.net/forum?id=s4D2nnwCcM", + "decision": "Reject", + "ratings": [ + 5, + 4, + 3, + 4 + ], + "rating": "4.06", + "confidences": [ + 4, + 5, + 3, + 4 + ], + "abstract": "Recent years have witnessed a surge of successful applications of machine reading comprehension. Of central importance to the tasks is the availability of massive amount of labeled data, which facilitates the training of large-scale neural networks. However, in many real-world problems, annotated data are expensive to gather not only because of time cost and budget, but also of certain domain-specific restrictions such as privacy for healthcare data. 
In this regard, we propose an uncertainty-based adaptive learning algorithm for reading comprehension, which interleaves data annotation and model updating to mitigate the demand of labeling. Our key techniques are two-fold: 1) an unsupervised uncertainty-based sampling scheme that queries the labels of the most informative instances with respect to the currently learned model; and 2) an adaptive loss minimization paradigm that simultaneously fits the data and controls the degree of model updating. We demonstrate on the benchmark datasets that 25\\% less labeled samples suffice to guarantee similar, or even improved performance. Our results demonstrate a strong evidence that for label-demanding scenarios, the proposed approach offers a practical guide on data collection and model training. ", + "title": "Uncertainty-Based Adaptive Learning for Reading Comprehension", + "authors": [ + "Jing Wang", + "Jie Shen", + "Xiaofei Ma", + "Andrew Arnold" + ], + "emails": [ + "amazon.com", + "~Xiaofei_Ma1", + "~Jie_Shen6", + "~Jing_Wang14" + ], + "rank": 2614 + }, + { + "url": "https://openreview.net/forum?id=CPfjKI8Yzx", + "decision": "Reject", + "ratings": [ + 4, + 4, + 6, + 3 + ], + "rating": "4.06", + "confidences": [ + 5, + 5, + 3, + 5 + ], + "abstract": "The goal of imitation learning is to mimic expert behavior from demonstrations, without access to an explicit reward signal. A popular class of approaches infers the (unknown) reward function via inverse reinforcement learning (IRL) followed by maximizing this reward function via reinforcement learning (RL). The policies learned via these approaches are however very brittle in practice and deteriorate quickly even with small test-time perturbations due to compounding errors. We propose Imitation with Planning at Test-time (IMPLANT), a new algorithm for imitation learning that utilizes decision-time planning to correct for compounding errors of any base imitation policy. 
In contrast to existing approaches, we retain both the imitation policy and the rewards model at decision-time, thereby benefiting from the learning signal of the two components. Empirically, we demonstrate that IMPLANT significantly outperforms benchmark imitation learning approaches on standard control environments and excels at zero-shot generalization when subject to challenging perturbations in test-time dynamics.", + "title": "Robust Imitation via Decision-Time Planning", + "authors": [ + "Carl Qi", + "Pieter Abbeel", + "Aditya Grover" + ], + "emails": [ + "~Aditya_Grover1", + "~Pieter_Abbeel2", + "~Carl_Qi1" + ], + "rank": 2615 + }, + { + "url": "https://openreview.net/forum?id=IuBLMxWOXMR", + "ratings": [ + 6, + 3, + 4, + 4 + ], + "rating": "4.06", + "confidences": [ + 3, + 5, + 4, + 5 + ], + "abstract": "If the accuracy of depth estimation from a single RGB image could be improved it would be possible to eliminate the need for expensive and bulky depth sensing hardware. The majority of efforts toward this end have been focused on utilizing geometric constraints, image sequences, or stereo image pairs with the help of a deep neural network. In this work, we propose a framework for simultaneous depth estimation from a single image and image focal stacks using depth-from-defocus and depth-from-focus algorithms. The proposed network is able to learn optimal depth mapping from the information contained in the blurring of a single image, generate a simulated image focal stack and all-in-focus image, and train a depth estimator from an image focal stack. As there is no large dataset specifically designed for our problem, we first learned on a synthetic indoor dataset: NYUv2. Then we compare the performance by comparing with other existing methods on DSLR dataset. Finally, we collected our own dataset using a DSLR and further verify on it. 
Experiments demonstrate that our system is able to provide comparable results compared with other state-of-the-art methods.", + "title": "Unsupervised Simultaneous Depth-from-defocus and Depth-from-focus", + "authors": [ + "Yawen Lu", + "Guoyu Lu" + ], + "emails": [ + "~Yawen_Lu1", + "~Guoyu_Lu3" + ], + "rank": 2616 + }, + { + "url": "https://openreview.net/forum?id=2aw7TEq5jo", + "ratings": [ + 4, + 4, + 5, + 3 + ], + "rating": "4.06", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Optimizing the channel counts for different layers of a convolutional neural net- work (CNN) to achieve better accuracy without increasing the number of floating- point operations (FLOPs) required during the forward pass at test time is known as CNN width optimization. Prior work on width optimization has cast it as a hyperparameter optimization problem, which introduces large computational overhead (e.g., an additional 2\u00d7 FLOPs of standard training). Minimizing this overhead could therefore significantly speed up training. With that in mind, this paper sets out to empirically understand width optimization by sensitivity analysis. Specifically, we consider the following research question: \u201cDo similar training configurations for a width optimization algorithm also share similar optimized widths?\u201d If this in fact is the case, it suggests that one can find a proxy training configuration requiring fewer FLOPs to reduce the width optimization overhead. Scientifically, it also suggests that similar training configurations share common architectural structure, which may be harnessed to build better methods. To this end, we control the training configurations, i.e., network architectures and training data, for three existing width optimization algorithms and find that the optimized widths are largely transferable across settings. 
Per our analysis, we can achieve up to 320\u00d7 reduction in width optimization overhead without compromising the top-1 accuracy on ImageNet. Our findings not only suggest an efficient way to conduct width optimization, but also highlight that the widths that lead to better accuracy are invariant to various aspects of network architectures and training data.", + "title": "Width transfer: on the (in)variance of width optimization", + "authors": [ + "Rudy Chin", + "Diana Marculescu", + "Ari S. Morcos" + ], + "emails": [ + "~Rudy_Chin2", + "~Diana_Marculescu4", + "~Ari_S._Morcos1" + ], + "rank": 2617 + }, + { + "url": "https://openreview.net/forum?id=O1pkU_4yWEt", + "decision": "Reject", + "ratings": [ + 3, + 4, + 4, + 5 + ], + "rating": "4.06", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "Medical entity extraction (EE) is a standard procedure used as a first stage inmedical texts processing. Usually Medical EE is a two-step process: named entityrecognition (NER) and named entity normalization (NEN). We propose a novelmethod of doing medical EE from electronic health records (EHR) as a single-step multi-label classification task by fine-tuning a transformer model pretrainedon a large EHR dataset. Our model is trained end-to-end in an distantly supervisedmanner using targets automatically extracted from medical knowledge base. 
Weshow that our model learns to generalize for entities that are present frequentlyenough, achieving human-level classification quality for most frequent entities.Our work demonstrates that medical entity extraction can be done end-to-endwithout human supervision and with human quality given the availability of alarge enough amount of unlabeled EHR and a medical knowledge base.", + "title": "Distantly supervised end-to-end medical entity extraction from electronic health records with human-level quality", + "authors": [ + "Alexander Nesterov", + "Dmitry Umerenkov" + ], + "emails": [ + "~Dmitry_Umerenkov1", + "sberbank.ru" + ], + "rank": 2618 + }, + { + "url": "https://openreview.net/forum?id=gdBGF7R8ZCJ", + "ratings": [ + 4, + 6, + 3, + 3 + ], + "rating": "4.06", + "confidences": [ + 5, + 4, + 3, + 4 + ], + "abstract": "Despite impressive results, deep learning-based technologies also raise severe privacy and environmental concerns induced by the training procedure often conducted in data centers. In response, alternatives to centralized training such as Federated Learning (FL) have emerged. Perhaps unexpectedly, FL in particular is starting to be deployed at a global scale by companies that must adhere to new legal demands and policies originating from governments and the civil society for privacy protection. However, the potential environmental impact related to FL remains unclear and unexplored. This paper offers the first-ever systematic study of the carbon footprint of FL. First, we propose a rigorous model to quantify the carbon footprint, hence facilitating the investigation of the relationship between FL design and carbon emissions. Then, we compare the carbon footprint of FL to traditional centralized learning. We also formalize an early-stage FL optimization problem enabling the community to consider the importance of optimizing the rate of CO2 emissions jointly to the accuracy of neural networks. 
Finally, we highlight and connect the reported results to the future challenges and trends in FL to reduce its environmental impact, including algorithms efficiency, hardware capabilities, and stronger industry transparency.", + "title": "A first look into the carbon footprint of federated learning", + "authors": [ + "Xinchi Qiu", + "Titouan Parcollet", + "daniel j Beutel", + "Taner Topal", + "Akhil Mathur", + "Nicholas Donald Lane" + ], + "emails": [ + "~Xinchi_Qiu1", + "~Akhil_Mathur1", + "~Titouan_Parcollet1", + "~Nicholas_Donald_Lane1", + "adap.com" + ], + "rank": 2619 + }, + { + "url": "https://openreview.net/forum?id=gDikr8MVsMF", + "ratings": [ + 4, + 3, + 4, + 5 + ], + "rating": "4.06", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "We propose a novel framework for value function factorization in multi-agent deep reinforcement learning using graph neural networks (GNNs). In particular, we consider the team of agents as the set of nodes of a complete directed graph, whose edge weights are governed by an attention mechanism. Building upon this underlying graph, we introduce a mixing GNN module, which is responsible for two tasks: i) factorizing the team state-action value function into individual per-agent observation-action value functions, and ii) explicit credit assignment to each agent in terms of fractions of the global team reward. Our approach, which we call GraphMIX, follows the centralized training and decentralized execution paradigm, enabling the agents to make their decisions independently once training is completed. Experimental results on the StarCraft II multi-agent challenge (SMAC) environment demonstrate the superiority of our proposed approach as compared to the state-of-the-art.", + "title": "Graph Convolutional Value Decomposition in Multi-Agent Reinforcement Learning", + "authors": [ + "Navid Naderializadeh", + "Fan H. 
Hung", + "Sean Soleyman", + "Deepak Khosla" + ], + "emails": [ + "hrl.com", + "~Navid_Naderializadeh1" + ], + "rank": 2620 + }, + { + "url": "https://openreview.net/forum?id=fStMpzKkjMT", + "decision": "Reject", + "ratings": [ + 6, + 3, + 3, + 5 + ], + "rating": "4.06", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "Distributed Deep Learning (DDL) is essential for large-scale Deep Learning (DL) training. Using a sufficiently large batch size is critical to achieving DDL runtime speedup. In a large batch setting, the learning rate must be increased to compensate for the reduced number of parameter updates. However, a large batch size may converge to sharp minima with poor generalization, and a large learning rate may harm convergence. Synchronous Stochastic Gradient Descent (SSGD) is the de facto DDL optimization method. Recently, Decentralized Parallel SGD (DPSGD) has been proven to achieve a similar convergence rate as SGD and to guarantee linear speedup for non-convex optimization problems. While there was anecdotal evidence that DPSGD outperforms SSGD in the large-batch setting, no systematic study has been conducted to explain why this is the case. Based on a detailed analysis of the DPSGD learning dynamics, we find that DPSGD introduces additional landscape-dependent noise, which has two benefits in the large-batch setting: 1) it automatically adjusts the learning rate to improve convergence; 2) it enhances weight space search by escaping local traps (e.g., saddle points) to find flat minima with better generalization. We conduct extensive studies over 12 state-of-the-art DL models/tasks and demonstrate that DPSGD consistently outperforms SSGD in the large batch setting; \n and DPSGD converges in cases where SSGD diverges for large learning rates. 
Our findings are consistent across different application domains, Computer Vision and Automatic Speech Recognition, and different neural network models, Convolutional Neural Networks and Long Short-Term Memory Recurrent Neural Networks.", + "title": "Why Does Decentralized Training Outperform Synchronous Training In The Large Batch Setting?", + "authors": [ + "Wei Zhang", + "Mingrui Liu", + "Yu Feng", + "Brian Kingsbury", + "Yuhai Tu" + ], + "emails": [ + "~Brian_Kingsbury1", + "~Yu_Feng3", + "~Mingrui_Liu2", + "~Yuhai_Tu1", + "~Wei_Zhang33" + ], + "rank": 2621 + }, + { + "url": "https://openreview.net/forum?id=tij5dHg5Hk", + "decision": "Reject", + "ratings": [ + 6, + 3, + 3, + 5 + ], + "rating": "4.06", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "Recently, a newly proposed self-supervised framework Bootstrap Your Own Latent (BYOL) seriously challenges the necessity of negative samples in contrastive-based learning frameworks. BYOL works like a charm despite the fact that it discards the negative samples completely and there is no measure to prevent collapse in its training objective. In this paper, we suggest understanding BYOL from the view of our newly proposed interpretable self-supervised learning framework, Run Away From your Teacher (RAFT). RAFT optimizes two objectives at the same time: (i) aligning two views of the same data to similar representations and (ii) running away from the model's Mean Teacher (MT, the exponential moving average of the history models) instead of BYOL's running towards it. The second term of RAFT explicitly prevents the representation collapse and thus makes RAFT a more conceptually reliable framework. We provide basic benchmarks of RAFT on CIFAR10 to validate the effectiveness of our method. 
Furthermore, we prove that BYOL is equivalent to RAFT under certain conditions, providing solid reasoning for BYOL's counter-intuitive success.", + "title": "Run Away From your Teacher: a New Self-Supervised Approach Solving the Puzzle of BYOL", + "authors": [ + "Haizhou Shi", + "Dongliang Luo", + "Siliang Tang", + "Jian Wang", + "Yueting Zhuang" + ], + "emails": [ + "~Yueting_Zhuang1", + "~Jian_Wang14", + "~Haizhou_Shi1", + "~Siliang_Tang1", + "~Dongliang_Luo1" + ], + "rank": 2622 + }, + { + "url": "https://openreview.net/forum?id=wU1yJ-n4et", + "ratings": [ + 3, + 5, + 4 + ], + "rating": "4.00", + "confidences": [ + 5, + 5, + 5 + ], + "abstract": "Semi-supervised domain adaptation (SSDA) is to adapt a learner to a new domain with only a small set of labeled samples when a large labeled dataset is given on a source domain. In this paper, we propose a pair-based SSDA method that adapts a learner to the target domain using self-distillation with sample pairs. Our method composes the sample pair by selecting a teacher sample from a labeled dataset (i.e., source or labeled target) and its student sample from an unlabeled dataset (i.e., unlabeled target), and then minimizes the output discrepancy between the two samples. We assign a reliable student to a teacher using pseudo-labeling and reliability evaluation so that the teacher sample propagates its prediction to the corresponding student sample. When the teacher sample is chosen from the source dataset, it minimizes the discrepancy between the source domain and the target domain. When the teacher sample is selected from the labeled target dataset, it reduces the discrepancy within the target domain. 
Experimental evaluation on standard benchmarks shows that our method effectively minimizes both the inter-domain and intra-domain discrepancies, thus achieving the state-of-the-art results.", + "title": "Pair-based Self-Distillation for Semi-supervised Domain Adaptation", + "authors": [ + "Jeongbeen Yoon", + "Dahyun Kang", + "Minsu Cho" + ], + "emails": [ + "~Jeongbeen_Yoon1", + "~Minsu_Cho1", + "~Dahyun_Kang1" + ], + "rank": 2623 + }, + { + "url": "https://openreview.net/forum?id=uxYjVEXx48i", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "Treatment recommendation is a complex multi-faceted problem with many conflicting objectives, e.g., optimizing the survival rate (or expected lifetime), mitigating negative impacts, reducing financial expenses and time costs, avoiding over-treatment, etc. While this complicates the hand-engineering of a reward function for learning treatment policies, fortunately, qualitative feedback from human experts is readily available and can be easily exploited. Since direct estimation of rewards via inverse reinforcement learning is a challenging task and requires the existence of an optimal human policy, the field of treatment recommendation has recently witnessed the development of the preference-based ReinforcementLearning (PRL) framework, which infers a reward function from only qualitative and imperfect human feedback to ensure that a human expert\u2019s preferred policy has a higher expected return over a less preferred policy. In this paper, we first present an open simulation platform to model the progression of two diseases, namely Cancer and Sepsis, and the reactions of the affected individuals to the received treatment. 
Secondly, we investigate important problems in adopting preference-basedRL approaches for treatment recommendation, such as advantages of learning from preference over hand-engineered reward, addressing incomparable policies, reward interpretability, and agent design via simulated experiments. The designed simulation platform and insights obtained for preference-based RL approaches are beneficial for achieving the right trade-off between various human objectives during treatment recommendation.", + "title": "An Examination of Preference-based Reinforcement Learning for Treatment Recommendation", + "authors": [ + "Nan Xu", + "Nitin Kamra", + "Yan Liu" + ], + "emails": [ + "~Nitin_Kamra1", + "~Yan_Liu1", + "~Nan_Xu2" + ], + "rank": 2624 + }, + { + "url": "https://openreview.net/forum?id=jcpcUjw7Kzz", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Discrete representations have been key in enabling robots to plan at more abstract levels and solve temporally-extended tasks more efficiently for decades. However, they typically require expert specifications. On the other hand, deep reinforcement learning aims to learn to solve tasks end-to-end, but struggles with long-horizon tasks. In this work, we propose Discrete Object-factorized Representation Planning (DORP), which learns temporally-abstracted discrete representations from exploratory video data in an unsupervised fashion via a mutual information maximization objective. DORP plans a sequence of abstract states for a low-level model-predictive controller to follow. In our experiments, we show that DORP robustly solves unseen long-horizon tasks. Interestingly, it discovers independent representations per object and binary properties such as a key-and-door. 
\n", + "title": "Discrete Predictive Representation for Long-horizon Planning", + "authors": [ + "Thanard Kurutach", + "Julia Peng", + "Yang Gao", + "Stuart Russell", + "Pieter Abbeel" + ], + "emails": [ + "~Thanard_Kurutach1", + "~Pieter_Abbeel2", + "~Stuart_Russell1", + "~Yang_Gao1", + "~Julia_Peng1" + ], + "rank": 2625 + }, + { + "url": "https://openreview.net/forum?id=N07ebsD-lHp", + "decision": "Reject", + "ratings": [ + 3, + 5, + 4 + ], + "rating": "4.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "While machine learning models today can achieve high accuracies on classification tasks, they can be deceived by minor imperceptible distortions to the data. These are known as adversarial attacks and can be lethal in the black-box setting which does not require knowledge of the target model type or its parameters. Binary neural networks that have sign activation and are trained with gradient descent have been shown to be harder to attack than conventional sigmoid activation networks but their improvements are marginal. We instead train sign activation networks with a novel gradient-free stochastic coordinate descent algorithm and propose an ensemble of such networks as a defense model. We evaluate the robustness of our model (a hard problem in itself) on image, text, and medical ECG data and find it to be more robust than ensembles of binary, full precision, and convolutional neural networks, and than random forests while attaining comparable clean test accuracy. In order to explain our model's robustness we show that an adversary targeting a single network in our ensemble fails to attack (and thus non-transferable to) other networks in the ensemble. Thus a datapoint requires a large distortion to fool the majority of networks in our ensemble and is likely to be detected in advance. 
This property of non-transferability arises naturally from the non-convexity of sign activation networks and randomization in our gradient-free training algorithm without any adversarial defense effort.", + "title": "Defending against black-box adversarial attacks with gradient-free trained sign activation neural networks", + "authors": [ + "Yunzhe Xue", + "Meiyan Xie", + "Zhibo Yang", + "Usman Roshan" + ], + "emails": [ + "njit.edu", + "~Usman_Roshan1" + ], + "rank": 2626 + }, + { + "url": "https://openreview.net/forum?id=ddpyTq6B6TU", + "ratings": [ + 6, + 3, + 3, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 4, + 4, + 4, + 3, + 4 + ], + "abstract": "Although pre-trained contextualized language models such as BERT achieve significant performance on various downstream tasks, current language representation still only focuses on linguistic objective at a specific granularity, which is not applicable when multiple levels of linguistic units are involved at the same time. Thus we present a universal representation model, BURT (BERT-inspired Universal Representation from learning meaningful segmenT), to encode different levels of linguistic unit into the same vector space. Specifically, we extract and mask meaningful segments based on point-wise mutual information (PMI) to incorporate different granular objectives into the pre-training stage. Our model surpasses BERT and BERT-wwm-ext on a wide range of downstream tasks in the ChineseGLUE (CLUE) benchmark. Especially, BURT-wwm-ext obtains 74.48% on the WSC test set, 3.45% point absolute improvement compared with its baseline model. We further verify the effectiveness of our unified pre-training strategy in two real-world text matching scenarios. As a result, our model significantly outperforms existing information retrieval (IR) methods and yields universal representations that can be directly applied to retrieval-based question-answering and natural language generation tasks. 
", + "title": "BURT: BERT-inspired Universal Representation from Learning Meaningful Segment", + "authors": [ + "Yian Li", + "hai zhao" + ], + "emails": [ + "~Yian_Li1", + "~hai_zhao1" + ], + "rank": 2627 + }, + { + "url": "https://openreview.net/forum?id=thhdrl4IdMm", + "decision": "Reject", + "ratings": [ + 6, + 4, + 4, + 3 + ], + "rating": "4.00", + "confidences": [ + 2, + 4, + 3, + 4 + ], + "abstract": "The last decade has witnessed a boom of deep learning research and applications achieving state-of-the-art results in various domains. However, most advances have been established empirically, and their theoretical analysis remains lacking. One major issue is that our current interpretation of neural networks (NNs) as function approximators is too generic to support in-depth analysis. In this paper, we remedy this by proposing an alternative interpretation that identifies NNs as chain graphs (CGs) and feed-forward as an approximate inference procedure. The CG interpretation specifies the nature of each NN component within the rich theoretical framework of probabilistic graphical models, while at the same time remains general enough to cover real-world NNs with arbitrary depth, multi-branching and varied activations, as well as common structures including convolution / recurrent layers, residual block and dropout. We demonstrate with concrete examples that the CG interpretation can provide novel theoretical support and insights for various NN techniques, as well as derive new deep learning approaches such as the concept of partially collapsed feed-forward inference. 
It is thus a promising framework that deepens our understanding of neural networks and provides a coherent theoretical formulation for future deep learning research.", + "title": "A Chain Graph Interpretation of Real-World Neural Networks", + "authors": [ + "Yuesong Shen", + "Daniel Cremers" + ], + "emails": [ + "~Daniel_Cremers1", + "~Yuesong_Shen1" + ], + "rank": 2628 + }, + { + "url": "https://openreview.net/forum?id=6htjOqus6C3", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "This paper challenges the common assumption that the weight \u03b2, in \u03b2-VAE, should be larger than 1 in order to effectively disentangle latent factors. We demonstrate that \u03b2-VAE, with \u03b2<1, can not only attain good disentanglement but also significantly improve reconstruction accuracy via dynamic control. The paper \\textit{removes the inherent trade-off} between reconstruction accuracy and disentanglement for \u03b2-VAE. Existing methods, such as \u03b2-VAE and FactorVAE, assign a large weight to the KL-divergence term in the objective function, leading to high reconstruction errors for the sake of better disentanglement. To mitigate this problem, a ControlVAE has recently been developed that dynamically tunes the KL-divergence weight in an attempt to \\textit{control the trade-off} to more a favorable point. However, ControlVAE fails to eliminate the conflict between the need for a large \u03b2 (for disentanglement) and the need for a small \u03b2 (for smaller reconstruction error). Instead, we propose DynamicVAE that maintains a different \u03b2 at different stages of training, thereby \\textit{decoupling disentanglement and reconstruction accuracy}. 
In order to evolve the weight, \u03b2, along a trajectory that enables such decoupling, DynamicVAE leverages a modified incremental PI (proportional-integral) controller, a variant of proportional-integral-derivative controller (PID) algorithm, and employs a moving average as well as a hybrid annealing method to evolve the value of KL-divergence smoothly in a tightly controlled fashion. We theoretically prove the stability of the proposed approach. Evaluation results on three benchmark datasets demonstrate that DynamicVAE significantly improves the reconstruction accuracy while achieving disentanglement comparable to the best of existing methods. The results verify that our method can separate disentangled representation learning and reconstruction, removing the inherent tension between the two. ", + "title": "DynamicVAE: Decoupling Reconstruction Error and Disentangled Representation Learning", + "authors": [ + "Huajie Shao", + "Haohong Lin", + "Qinmin Yang", + "Shuochao Yao", + "Han Zhao", + "Tarek Abdelzaher" + ], + "emails": [ + "~Han_Zhao1", + "~Huajie_Shao1", + "~Shuochao_Yao1", + "zju.edu.cn", + "~Tarek_Abdelzaher1" + ], + "rank": 2629 + }, + { + "url": "https://openreview.net/forum?id=MqWHrrIfBMN", + "ratings": [ + 4, + 3, + 8 + ], + "rating": "4.00", + "confidences": [ + 4, + 4, + 1 + ], + "abstract": "With the increasing demand for more and more data, the federated learning (FL) methods, which try to utilize highly distributed on-device local data in the training process, have been proposed.However, fledgling services provided by startup companies not only have limited number of clients, but also have minimal resources for constant communications between the server and multiple clients. In addition, in a real-world environment where the user pool changes dynamically, the FL system must be able to efficiently utilize rapid inflow and outflow of users, while at the same time experience minimal bottleneck due to network delays of multiple users. 
In this respect, we amend the federated learning scenario to a more flexible asynchronous edge learning. To solve the aforementioned learning problems, we propose an asynchronous model-based communication method with knowledge distillation. In particular, we dub our knowledge distillation scheme as ``\"cloned distillation\" and explain how it is different from other knowledge distillation method. In brief, we found that in knowledge distillation between the teacher and the student there exist two contesting traits in the student: to attend to the teacher's knowledge or to retain its own knowledge exclusive to the teacher. And in this edge learning scenario, the attending property should be amplified rather than the retaining property, because teachers are dispatched to the users to learn from them and recollected at the server to teach the core model. Our asynchronous edge learning method can elastically handle the dynamic inflow and outflow of users in a service with minimal communication cost, operate with essentially no bottleneck due to user delay, and protect user's privacy. 
Also we found that it is robust to users who behave abnormally or maliciously.", + "title": "Asynchronous Edge Learning using Cloned Knowledge Distillation", + "authors": [ + "Sangho Lee", + "KiYoon Yoo", + "Nojun Kwak" + ], + "emails": [ + "~Nojun_Kwak1", + "snu.ac.kr", + "~Sangho_Lee3" + ], + "rank": 2630 + }, + { + "url": "https://openreview.net/forum?id=1WF-fPvY_jQ", + "ratings": [ + 4, + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Though the pre-trained contextualized language model (PrLM) has made a significant impact on NLP, training PrLMs in languages other than English can be impractical for two reasons: other languages often lack corpora sufficient for training powerful PrLMs, and because of the commonalities among human languages, computationally expensive PrLM training for different languages is somewhat redundant.\nIn this work, building upon the recent works connecting cross-lingual transfer learning and neural machine translation, we thus propose a novel cross-lingual transfer learning framework for PrLMs: \\textsc{TreLM}. \nTo handle the symbol order and sequence length differences between languages, we propose an intermediate ``TRILayer\" structure that learns from these differences and creates a better transfer in our primary translation direction, as well as a new cross-lingual language modeling objective for transfer training. \nAdditionally, we showcase an embedding aligning that adversarially adapts a PrLM's non-contextualized embedding space and the TRILayer structure to learn a text transformation network across languages, which addresses the vocabulary difference between languages. \nExperiments on both language understanding and structure parsing tasks show the proposed framework significantly outperforms language models trained from scratch with limited data in both performance and efficiency. 
\nMoreover, despite an insignificant performance loss compared to pre-training from scratch in resource-rich scenarios, our transfer learning framework is significantly more economical.", + "title": "Cross-lingual Transfer Learning for Pre-trained Contextualized Language Models", + "authors": [ + "Zuchao Li", + "Kevin Barry Parnow", + "hai zhao", + "Zhuosheng Zhang", + "Rui Wang", + "Masao Utiyama", + "Eiichiro Sumita" + ], + "emails": [ + "~hai_zhao1", + "~Rui_Wang10", + "~Zuchao_Li1", + "~Kevin_Barry_Parnow1", + "~Zhuosheng_Zhang1", + "~Eiichiro_Sumita1", + "~Masao_Utiyama2" + ], + "rank": 2631 + }, + { + "url": "https://openreview.net/forum?id=hUAmiQCeUGm", + "decision": "Reject", + "ratings": [ + 4, + 5, + 3 + ], + "rating": "4.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Extreme multi-label learning (XML) works to annotate objects with relevant labels from an extremely large label set. Many previous methods treat labels uniformly such that the learned model tends to perform better on head labels, while the performance is severely deteriorated for tail labels. However, it is often desirable to predict more tail labels in many real-world applications. To alleviate this problem, in this work, we show theoretical and experimental evidence for the inferior performance of representative XML methods on tail labels. Our finding is that the norm of label classifier weights typically follows a long-tailed distribution similar to the label frequency, which results in the over-suppression of tail labels. Base on this new finding, we present two new modules: (1)~\\algoa~learns to re-rank the predictions by optimizing a population-aware loss, which predicts tail labels with high rank; (2)~\\algob~augments tail labels via a decoupled learning scheme, which can yield more balanced classification boundary. 
We conduct experiments on commonly used XML benchmarks with hundreds of thousands of labels, showing that the proposed methods improve the performance of many state-of-the-art XML models by a considerable margin (6\\% performance gain with respect to PSP@1 on average).", + "title": "Improving Tail Label Prediction for Extreme Multi-label Learning", + "authors": [ + "Tong Wei", + "Wei-Wei Tu", + "Yu-Feng Li" + ], + "emails": [ + "~Wei-Wei_Tu1", + "~Tong_Wei1", + "~Yu-Feng_Li1" + ], + "rank": 2632 + }, + { + "url": "https://openreview.net/forum?id=XZzriKGEj0_", + "decision": "Reject", + "ratings": [ + 5, + 3, + 6, + 3 + ], + "rating": "4.00", + "confidences": [ + 4, + 4, + 2, + 4 + ], + "abstract": "Gaussian Process (GP) regression fits a curve on a set of datapairs, with each pair consisting of an input point 'x' and its corresponding target regression value 'y(x)' (a positive datapair). But, what if for an input point 'x\u00af', we want to constrain the GP to avoid a target regression value 'y\u00af(x\u00af)' (a negative datapair)? This requirement can often appear in real-world navigation tasks, where an agent would want to avoid obstacles, like furniture items in a room when planning a trajectory to navigate. In this work, we propose to incorporate such negative constraints in a GP regression framework. Our approach, 'GP-NC' or Gaussian Process with Negative Constraints, fits over the positive datapairs while avoiding the negative datapairs. Specifically, our key idea is to model the negative datapairs using small blobs of Gaussian distribution and maximize its KL divergence from the GP. We jointly optimize the GP-NC for both the positive and negative datapairs. 
We empirically demonstrate that our GP-NC framework performs better than the traditional GP learning and that our framework does not affect the scalability of Gaussian Process regression and helps the model converge faster as the size of the data increases.", + "title": "Learning What Not to Model: Gaussian Process Regression with Negative Constraints", + "authors": [ + "Gaurav Shrivastava", + "Harsh Shrivastava", + "Abhinav Shrivastava" + ], + "emails": [ + "~Abhinav_Shrivastava2", + "~Harsh_Shrivastava1", + "~Gaurav_Shrivastava1" + ], + "rank": 2633 + }, + { + "url": "https://openreview.net/forum?id=qJIvFn8sOs", + "ratings": [ + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 3, + 2, + 3 + ], + "abstract": "Learning disentangled representations of textual data is essential for many natural language tasks such as fair classification (\\textit{e.g.} building classifiers whose decisions cannot disproportionately hurt or benefit specific groups identified by sensitive attributes), style transfer and sentence generation, among others. The existent dominant approaches in the context of text data have been based on training an adversary (discriminator or teacher) that aims at making attribute values difficult to be inferred from the latent code. Although these approaches are remarkably simple and even though the adversary seems to be performing perfectly during the training phase, after training is completed a fair amount of sensitive information to infer the attribute still remains. This paper investigates learning to disentangle representations by minimizing a novel variational (upper) bound of the mutual information between an identified attribute and the latent code of a deep neural network encoder. We demonstrate that our surrogate leads to better disentangled representations on both fair classification and sentence generation tasks while not suffering from the degeneracy of adversarial losses in multi-class scenarios. 
Furthermore, by optimizing the trade-off between the level of disentanglement and quality of the generated sentences for polarity transfer and sentence generation tasks, we provide some lights to the well-known debate on whether or not \\textit{``disentangled representations may be helpful for polarity transfer and sentence generation purposes''}.", + "title": "Learning to Disentangle Textual Representations and Attributes via Mutual Information", + "authors": [ + "Pierre Colombo", + "Chlo\u00e9 Clavel", + "Pablo Piantanida" + ], + "emails": [ + "~Pierre_Colombo2", + "~Chlo\u00e9_Clavel1", + "~Pablo_Piantanida1" + ], + "rank": 2634 + }, + { + "url": "https://openreview.net/forum?id=d9Emve8gG5E", + "decision": "Reject", + "ratings": [ + 5, + 3, + 4 + ], + "rating": "4.00", + "confidences": [ + 4, + 4, + 2 + ], + "abstract": "Lately, personalized marketing has become important for retail/e-retail firms due to significant rise in online shopping and market competition. Increase in online shopping and high market competition has led to an increase in promotional expenditure for online retailers, and hence, rolling out optimal offers has become imperative to maintain balance between number of transactions and profit. In this paper, we propose our approach to solve the offer optimization problem at the intersection of consumer, item and time in retail setting. To optimize offer, we first build a generalized non-linear model using Temporal Convolutional Network to predict the item purchase probability at consumer level for the given time period. Secondly, we establish the functional relationship between historical offer values and purchase probabilities obtained from the model, which is then used to estimate offer-elasticity of purchase probability at consumer item granularity. Finally, using estimated elasticities, we optimize offer values using constraint based optimization technique. 
This paper describes our detailed methodology and presents the results of modelling and optimization across categories.", + "title": "OFFER PERSONALIZATION USING TEMPORAL CONVOLUTION NETWORK AND OPTIMIZATION", + "authors": [ + "Ankur Verma" + ], + "emails": [ + "~Ankur_Verma1" + ], + "rank": 2635 + }, + { + "url": "https://openreview.net/forum?id=TUFwWlAL8r", + "ratings": [ + 4, + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Learning from past mistakes is a quintessential aspect of intelligence. In sequential decision-making, existing meta-learning methods that learn a learning algorithm utilize experience from only a few previous episodes to adapt their policy to new environments and tasks. Such methods must learn to correct their mistakes from highly-correlated sequences of states and actions generated by the same policy's consequent roll-outs during training. Learning from correlated data is known to be problematic and can significantly impact the quality of the learned correction mechanism. We show that this problem can be mitigated by augmenting current systems with an external memory bank that stores a larger and more diverse set of past experiences. Detailed experiments demonstrate that our method outperforms existing meta-learning algorithms on a suite of challenging tasks from raw visual observations.", + "title": "Learning to Recover from Failures using Memory", + "authors": [ + "Tao Chen", + "Pulkit Agrawal" + ], + "emails": [ + "~Pulkit_Agrawal1", + "~Tao_Chen1" + ], + "rank": 2636 + }, + { + "url": "https://openreview.net/forum?id=kV-KqNefqh", + "ratings": [ + 4, + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "In the areas of machine learning and computer vision, text-to-image synthesis aims at producing image outputs given the input text. 
In particular, the task of layout generation requires one to describe the spatial information for each object component, with the ability to model their relationships. In this paper, we present a LayoutTransformer Network (LT-Net), which is a generative model for text-conditioned layout generation. By extracting semantics-aware yet object discriminative contextual features from the input, we utilize Gaussian mixture models to describe the layouts for each object with relation consistency enforced. Finally, a co-attention mechanism across textual and visual features is deployed to produce the final output. In our experiments, we conduct extensive experiments on both MS-COCO and Visual Genome (VG) datasets, and confirm the effectiveness and superiority of our LT-Net over recent text-to-image and layout generation models.", + "title": "LayoutTransformer: Relation-Aware Scene Layout Generation", + "authors": [ + "Cheng-Fu Yang", + "Wan-Cyuan Fan", + "Fu-En Yang", + "Yu-Chiang Frank Wang" + ], + "emails": [ + "~Wan-Cyuan_Fan1", + "~Cheng-Fu_Yang1", + "~Fu-En_Yang1", + "~Yu-Chiang_Frank_Wang2" + ], + "rank": 2637 + }, + { + "url": "https://openreview.net/forum?id=bWqodw-mFi1", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 3, + 4, + 3, + 5 + ], + "abstract": "The typical contrastive self-supervised algorithm uses a similarity measure in latent space as the supervision signal by contrasting positive and negative images directly or indirectly. Although the utility of self-supervised algorithms has improved recently, there are still bottlenecks hindering their widespread use, such as the compute needed. In this paper, we propose a module that serves as an additional objective in the self-supervised contrastive learning paradigm. 
We show how the inclusion of this module to regress the parameters of an affine transformation or homography, in addition to the original contrastive objective, improves both performance and rate of learning. Importantly, we ensure that this module does not enforce invariance to the various components of the affine transform, as this is not always ideal. We demonstrate the effectiveness of the additional objective on two recent, popular self-supervised algorithms. We perform an extensive experimental analysis of the proposed method and show an improvement in performance for all considered datasets. Further, we find that although both the general homography and affine transformation are sufficient to improve performance and convergence, the affine transformation performs better in all cases.", + "title": "Explicit homography estimation improves contrastive self-supervised learning", + "authors": [ + "David Torpey", + "Richard Klein" + ], + "emails": [ + "~David_Torpey1", + "~Richard_Klein1" + ], + "rank": 2638 + }, + { + "url": "https://openreview.net/forum?id=DegtqJSbxo", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 4, + 4, + 5 + ], + "abstract": "In this paper we aim to explore the general robustness of neural network classifiers by utilizing adversarial as well as natural perturbations. Different from previous works which mainly focus on studying the robustness of neural networks against adversarial perturbations, we also evaluate their robustness on natural perturbations before and after robustification. After standardizing the comparison between adversarial and natural perturbations, we demonstrate that although adversarial training improves the performance of the networks against adversarial perturbations, it leads to drop in the performance for naturally perturbed samples besides clean samples. 
In contrast, natural perturbations like elastic deformations, occlusions and wave does not only improve the performance against natural perturbations, but also lead to improvement in the performance for the adversarial perturbations. Additionally they do not drop the accuracy on the clean images.", + "title": "Adversarial and Natural Perturbations for General Robustness", + "authors": [ + "Sadaf Gulshad", + "Jan Hendrik Metzen", + "Arnold W.M. Smeulders" + ], + "emails": [ + "~Jan_Hendrik_Metzen1", + "~Arnold_W.M._Smeulders1", + "~Sadaf_Gulshad1" + ], + "rank": 2639 + }, + { + "url": "https://openreview.net/forum?id=AhLeNin_5sh", + "decision": "Reject", + "ratings": [ + 4, + 5, + 3 + ], + "rating": "4.00", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "Anomaly Detection(AD) for multivariate time series is an active area in machine learning, with critical applications in Information Technology system management, Spacecraft Health monitoring, Multi-Robot Systems detection, etc.. However, due to complex correlations and various temporal patterns of large-scale multivariate time series, a general unsupervised anomaly detection model with higher F1-score and Timeliness remains a challenging task. In this paper, We propose a General representations of multivariate time series for Anomaly Detection(GenAD). First, we apply Time-Series Attention to represent the various temporal patterns of each time series. Second, we employ Multi-Correlation Attention to represent the complex correlations of multivariate time series. With the above innovations, GenAD improves F1-scores of AD by 0.3% to 5% over state-of-the-art model in public datasets, while detecting anomalies more rapidly in anomaly segments. Moreover, we propose a general pre-training algorithm on large-scale multivariate time series, which can be easily transferred to a specific AD tasks with only a few fine-tuning steps. 
Extensive experiments show that GenAD is able to outperform state-of-the-art model with only 10% of the training data. ", + "title": "GenAD: General Representations of Multivariate Time Series for Anomaly Detection", + "authors": [ + "Xiaolei Hua", + "Su Wang", + "Lin Zhu", + "Dong Zhou", + "Junlan Feng", + "Yiting Wang", + "Chao Deng", + "Shuo Wang", + "Mingtao Mei" + ], + "emails": [ + "~Chao_Deng3", + "~Mingtao_Mei1", + "~Dong_Zhou1", + "~Junlan_Feng2", + "~Su_Wang1", + "~Shuo_Wang12", + "~Lin_Zhu6", + "~Yiting_Wang1", + "~Xiaolei_Hua1" + ], + "rank": 2640 + }, + { + "url": "https://openreview.net/forum?id=q8mp_buclp", + "ratings": [ + 4, + 5, + 5, + 2 + ], + "rating": "4.00", + "confidences": [ + 4, + 5, + 5, + 5 + ], + "abstract": "Program source code contains complex structure information, which can be represented in structured data forms like trees or graphs. To acquire the structural information in source code, most existing researches use abstract syntax trees (AST). A group of works add additional edges to ASTs to convert source code into graphs and use graph neural networks to learn representations for program graphs. Although these works provide additional control or data flow information to ASTs for downstream tasks, they neglect an important aspect of structure information in AST itself: the different types of nodes and edges. In ASTs, different nodes contain different kinds of information like variables or control flow, and the relation between a node and all its children can also be different.\n\nTo address the information of node and edge types, we bring the idea of heterogeneous graph mining to learning on source code and present a new formula of building heterogeneous program graphs from ASTs with additional type information for nodes and edges. We use the ASDL grammar of programming language to define the node and edge types of program graphs. Then we use heterogeneous graph neural networks to learn on these graphs. 
We evaluate our approach on two tasks: code comment generation and method naming. Both tasks require reasoning on the semantics of complete code snippets. Experiment results show that our approach outperforms baseline models, including homogeneous graph-based models, showing that leveraging the type information of nodes and edges in program graphs can help in learning program semantics.", + "title": "Learning to Represent Programs with Heterogeneous Graphs", + "authors": [ + "Wenhan Wang", + "Kechi Zhang", + "Ge Li", + "Zhi Jin" + ], + "emails": [ + "pku.edu.cn", + "~Zhi_Jin1", + "~Wenhan_Wang2" + ], + "rank": 2641 + }, + { + "url": "https://openreview.net/forum?id=0DALDI-xyW4", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "Nesterov's accelerated method are widely used in problems with machine learning background including deep learning. To give more insight about the acceleration phenomenon, an ordinary differential equation was obtained from Nesterov's accelerated method by taking step sizes approaching zero, and the relationship between Nesterov's method and the differential equation is still of research interest. In this work, we give the precise order of the iterations of Nesterov's accelerated method converging to the solution of derived differential equation as step sizes go to zero. We then present a new accelerated method with higher order. The new method is more stable than ordinary method for large step size and converges faster. 
We further apply the new method to matrix completion problem and show its better performance through numerical experiments.", + "title": "A new accelerated gradient method inspired by continuous-time perspective", + "authors": [ + "Yasong Feng", + "Weiguo Gao" + ], + "emails": [ + "fudan.edu.cn", + "~Yasong_Feng1" + ], + "rank": 2642 + }, + { + "url": "https://openreview.net/forum?id=YfjCEVpolsz", + "ratings": [ + 4, + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 4, + 5, + 5, + 4 + ], + "abstract": "Macro actions have been demonstrated to be beneficial for the learning processes of an agent. A variety of techniques have been developed to construct more effective macro actions. However, they usually fail to provide an approach for combining macro actions to form a synergistic macro action ensemble. A synergistic macro action ensemble performs better than individual macro actions within it. Motivated by the recent advances of neural architecture search, we formulate the construction of a synergistic macro action ensemble as a sequential decision problem and evaluate the ensemble in a task. The formulation of sequential decision problem enables coherency in the macro actions to be considered. Also, our evaluation procedure takes synergism into account since the synergism among the macro action ensemble exhibits when jointly used by an agent. The experimental results show that our framework is able to discover synergistic macro action ensembles. 
We further perform experiments to validate the synergism property among the macro actions in an ensemble.", + "title": "Toward Synergism in Macro Action Ensembles", + "authors": [ + "Yu Ming Chen", + "Kuan-Yu Chang", + "Chien Liu", + "Tsu-Ching Hsiao", + "Zhang-Wei Hong", + "Chun-Yi Lee" + ], + "emails": [ + "~Yu_Ming_Chen1", + "~Zhang-Wei_Hong1", + "~Kuan-Yu_Chang1", + "~Chien_Liu1", + "~Tsu-Ching_Hsiao1", + "~Chun-Yi_Lee1" + ], + "rank": 2643 + }, + { + "url": "https://openreview.net/forum?id=TqQ0oOzJlai", + "decision": "Reject", + "ratings": [ + 5, + 3, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 4, + 4, + 4, + 2 + ], + "abstract": "Long iterative training processes for Deep Neural Networks (DNNs) are commonly required to achieve state-of-the-art performance in many computer vision tasks. Core-set selection and importance sampling approaches might play a key role in budgeted training regimes, i.e.~when limiting the number of training iterations. The former demonstrate that retaining informative samples is important to avoid large drops in accuracy, and the later aim at dynamically estimating the sample importance to speed-up convergence. This work explores this paradigm and how a budget constraint interacts with importance sampling approaches and data augmentation techniques. We show that under budget restrictions, importance sampling approaches do not provide a consistent improvement over uniform sampling. We suggest that, given a specific budget, the best course of action is to disregard the importance and introduce adequate data augmentation. For example, training in CIFAR-10/100 with 30% of the full training budget, a uniform sampling strategy with certain data augmentation surpasses the performance of 100% budget models trained with standard data augmentation. 
We conclude from our work that DNNs under budget restrictions benefit greatly from variety in the samples and that finding the right samples to train is not the most effective strategy when balancing high performance with low computational requirements. The code will be released after the review process.", + "title": "How Important is Importance Sampling for Deep Budgeted Training?", + "authors": [ + "Eric Arazo", + "Diego Ortego", + "Paul Albert", + "Noel O'Connor", + "Kevin McGuinness" + ], + "emails": [ + "~Diego_Ortego2", + "~Kevin_McGuinness1", + "~Noel_O'Connor1", + "~Eric_Arazo1", + "~Paul_Albert2" + ], + "rank": 2644 + }, + { + "url": "https://openreview.net/forum?id=3InxcRQsYLf", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 5, + 5, + 4, + 4 + ], + "abstract": "We present VideoGen: a conceptually simple architecture for scaling likelihood based generative modeling to natural videos. VideoGen uses VQ-VAE that learns learns downsampled discrete latent representations of a video by employing 3D convolutions and axial self-attention. A simple GPT-like architecture is then used to autoregressively model the discrete latents using spatio-temporal position encodings. Despite the simplicity in formulation, ease of training and a light compute requirement, our architecture is able to generate samples competitive with state-of-the-art GAN models for video generation on the BAIR Robot dataset, and generate coherent action-conditioned samples based on experiences gathered from the VizDoom simulator. We hope our proposed architecture serves as a reproducible reference for a minimalistic implementation of transformer based video generation models without requiring industry scale compute resources. 
Samples are available at https://sites.google.com/view/videogen", + "title": "VideoGen: Generative Modeling of Videos using VQ-VAE and Transformers", + "authors": [ + "Yunzhi Zhang", + "Wilson Yan", + "Pieter Abbeel", + "Aravind Srinivas" + ], + "emails": [ + "~Aravind_Srinivas1", + "~Pieter_Abbeel2", + "~Wilson_Yan1", + "berkeley.edu" + ], + "rank": 2645 + }, + { + "url": "https://openreview.net/forum?id=GeOIKynj_V", + "ratings": [ + 4, + 4, + 5, + 2 + ], + "rating": "4.00", + "confidences": [ + 3, + 4, + 4, + 2 + ], + "abstract": "In recent years, Capsule Networks (CapsNets) have achieved promising results in tasks in the object recognition task thanks to their invariance characteristics towards pose and lighting.\nThey have been proposed as an alternative to relational insensitive and translation invariant Convolutional Neural Networks (CNN). \nIt has been empirically proven that CapsNets are capable of achieving competitive performance while requiring significantly fewer parameters.\nThis is a desirable characteristic for Deep reinforcement learning which is known to be sample-inefficient during training.\nIn this paper, we conduct a systematic analysis to explore the potential of CapsNets-based agents in the deep reinforcement learning setting.\nMore specifically, we compare the performance of a CNN-based agent with a CapsNets-based agent in a deep Q-network using the Atari suite as the testbed of our analysis. 
\nTo the best of our knowledge, this work constitutes the first CapsNets based deep reinforcement learning model to learn state-action value functions without the need of task-specific adaptation.\nOur results show that, in this setting, CapsNets-based architectures require 92% fewer parameters compared to their CNN-based counterparts.\nMoreover, despite their smaller size, the CapsNets-based agents provide significant boosts in performance (score), ranging between 10% - 77%.\nThis is supported by our empirical results which shows that CapsNets-based agents outperform the CNN-based agent, in a Double-DQN with Prioritized experience replay setting, in eight out of the nine selected environments.", + "title": "Playing Atari with Capsule Networks: A systematic comparison of CNN and CapsNets-based agents.", + "authors": [ + "Akash Singh", + "Kevin Mets", + "Jose Oramas", + "Steven Latr\u00e9" + ], + "emails": [ + "~Kevin_Mets1", + "uantwerpen.be", + "~Akash_Singh3", + "~Jose_Oramas1" + ], + "rank": 2646 + }, + { + "url": "https://openreview.net/forum?id=zspml_qcldq", + "decision": "Reject", + "ratings": [ + 3, + 4, + 5 + ], + "rating": "4.00", + "confidences": [ + 5, + 5, + 5 + ], + "abstract": "Recent advances in using retrieval components over external knowledge sources have shown impressive results for a variety of downstream tasks in natural language processing. Here, we explore the use of unstructured external knowledge sources of images and their corresponding captions for improving visual question answering (VQA). First, we train a novel alignment model for embedding images and captions in the same space, which achieves state-of-the-art image-caption retrieval performance w.r.t. 
similar methods.\nSecond, we show that retrieval-augmented multi-modal transformers using the trained alignment model\nsignificantly improve results on VQA over strong baselines.\nWe further conduct extensive experiments to establish the promise of this approach, and examine novel applications for inference time such as hot-swapping indices.", + "title": "Cross-Modal Retrieval Augmentation for Multi-Modal Classification", + "authors": [ + "Shir Gur", + "Natalia Neverova", + "Chris Stauffer", + "Ser-Nam Lim", + "Douwe Kiela", + "Austin Reiter" + ], + "emails": [ + "~Natalia_Neverova1", + "~Austin_Reiter3", + "~Shir_Gur1", + "~Douwe_Kiela1", + "~Ser-Nam_Lim3", + "~Chris_Stauffer2" + ], + "rank": 2647 + }, + { + "url": "https://openreview.net/forum?id=uLwplzQgAk7", + "ratings": [ + 4, + 4, + 8, + 2 + ], + "rating": "4.00", + "confidences": [ + 2, + 3, + 2, + 4 + ], + "abstract": "We connect a large class of Deep Generative Networks (DGNs) with spline operators in order to derive their properties, limitations, and new opportunities.\nBy characterizing the latent space partition and per-region affine mappings, we relate the manifold dimension and approximation error to the sample size, provide a theoretically motivated ``elbow method'' to infer the intrinsic dimension of the manifold being approximated, study the (not always) beneficial impact of dropout/dropconnect, and demonstrate how the DGN generated piecewise affine surface can be ``smoothed'' to facilitate latent space optimization as used in inverse problems.\nThe per-region affine subspace defines a local coordinate basis; we provide necessary and sufficient conditions relating those basis vectors with disentanglement and demonstrate how to interpret the DGN learned parameters. \nWe also derive the output probability density mapped onto the generated manifold in terms of the latent space density, which enables the computation of key statistics such as its Shannon entropy. 
\nThis finding also enables us to highlight the source of training instabilities when the target density is multimodal: as the modes gets further apart and/or more concentrated, as the approximant DGN per-region mappings must have increasing singular values leading to large amplitude layer weights and causing training instabilities.", + "title": "Max-Affine Spline Insights Into Deep Generative Networks", + "authors": [ + "Randall Balestriero", + "Sebastien Paris", + "Richard Baraniuk" + ], + "emails": [ + "lis-lab.fr", + "~Richard_Baraniuk1", + "~Randall_Balestriero1" + ], + "rank": 2648 + }, + { + "url": "https://openreview.net/forum?id=YdFV691VRcg", + "ratings": [ + 4, + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "Recent metric learning approaches parametrize semantic similarity measures through the use of an encoder trained along with a similarity model, which operates over pairs of representations. We extend such a setting and enable its use in tasks including multi-class classification in order to tackle known issues observed in standard classifiers such as their lack of robustness to out-of-distribution data. We do so by further learning a set of class prototypes, each one representing a particular class. Training is carried out so that each encoded example is pushed towards the prototype corresponding to its class, and test instances are assigned to the class corresponding to the prototype they are closest to. We thus provide empirical evidence showing the proposed setting is able to match object recognition performance of standard classifiers on common benchmarks, while presenting much improved robustness to adversarial examples and distribution shifts. We further show such a model is effective for tasks other than classification, including those requiring pairwise comparisons such as verification and retrieval. 
Finally, we discuss a simple scheme for few-shot learning of new classes where only the set of prototypes needs to be updated, yielding competitive performance.", + "title": "Learning Semantic Similarities for Prototypical Classifiers", + "authors": [ + "Joao Monteiro", + "Isabela Albuquerque", + "Jahangir Alam", + "Tiago Falk" + ], + "emails": [ + "~Joao_Monteiro1", + "~Jahangir_Alam1", + "~Isabela_Albuquerque1", + "~Tiago_Falk1" + ], + "rank": 2649 + }, + { + "url": "https://openreview.net/forum?id=Q5ZxoD2LqcI", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5, + 3 + ], + "rating": "4.00", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "In recent years, there has been a resurgence in research on empirical methods for machine translation. Most of this research has been focused on high-resource, European languages. Despite the fact that around 30% of all languages spoken worldwide are African, the latter have been heavily under investigated and this, partly due to the lack of public parallel corpora online. Furthermore, despite their large number (more than 2,000) and the similarities between them, there is currently no publicly available study on how to use this multilingualism (and associated similarities) to improve machine translation systems performance on African languages. So as to address these issues: \nWe propose a new dataset for African languages that provides parallel data for vernaculars not present in commonly used dataset like JW300 [1]. To exploit multilingualism, we first use a historical approach based on historical origins of these languages, their morphologies, their geographical and cultural distributions as well as migrations of population to identify similar vernaculars.\nWe also propose a new metric to automatically evaluate similarities between languages. 
This new metric does not require word level parallelism like traditional methods but only paragraph level parallelism.\nWe then show that performing Masked Language Modelling and Translation Language Modeling in addition to multi-task learning on a cluster of similar languages leads to a strong boost of performance in translating individual pairs inside this cluster.\nIn particular, we record an improvement of 29 BLEU on the pair Bafia-Ewondo using our approaches compared to previous work methods that did not exploit multilingualism in any way.\n\n[1] http://opus.nlpl.eu/JW300.php", + "title": "On the use of linguistic similarities to improve Neural Machine Translation for African Languages", + "authors": [ + "Tikeng Notsawo Pascal", + "NANDA ASSOBJIO Brice Yvan", + "James Assiene" + ], + "emails": [ + "~NANDA_ASSOBJIO_Brice_Yvan1", + "~Tikeng_Notsawo_Pascal1", + "~James_Assiene1" + ], + "rank": 2650 + }, + { + "url": "https://openreview.net/forum?id=ZpNfWV6XcV1", + "ratings": [ + 4, + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "While large neural networks demonstrate higher performance in various tasks, training large networks is difficult due to limitations on GPU memory size. \nWe propose a novel out-of-core algorithm that enables faster training of extremely large-scale neural networks with sizes larger than allotted GPU memory. \nUnder a given memory budget constraint, our scheduling algorithm locally adapts the timing of memory transfers according to memory usage of each function, which improves overlap between computation and memory transfers. \nAdditionally, we apply virtual addressing technique, commonly performed in OS, to training of neural networks with out-of-core execution, which drastically reduces the amount of memory fragmentation caused by frequent memory transfers. 
With our proposed algorithm, we successfully train ResNet-50 with 1440 batch-size with keeping training speed at 55\\%, which is 7.5x larger than the upper bound of physical memory. It also outperforms a previous state-of-the-art substantially, i.e. it trains a 1.55x larger network than state-of-the-art with faster execution. Moreover, we experimentally show that our approach is also scalable for various types of networks.", + "title": "Out-of-Core Training for Extremely Large-Scale Neural Networks with Adaptive Window-Based Scheduling", + "authors": [ + "Akio Hayakawa", + "Takuya Narihira" + ], + "emails": [ + "~Akio_Hayakawa1", + "~Takuya_Narihira2" + ], + "rank": 2651 + }, + { + "url": "https://openreview.net/forum?id=36G2rwDbk1k", + "ratings": [ + 3, + 5, + 4 + ], + "rating": "4.00", + "confidences": [ + 3, + 3, + 5 + ], + "abstract": "Detecting feature's predictive power is a key problem in Machine Learning. Previous methods have been focusing on providing a single value, usually named feature importance, as a point estimate of the power. However, it is difficult to interpret the predictive power using feature importance. Moreover, in reality feature's predictive power may vary dramatically across feature values. Feature importance, as a point estimate, cannot capture such variance. To address the two problems, we first propose a new definition of feature importance to directly measure feature's predictive power. We then propose a feature importance model to capture a high-resolution distribution of feature importance across feature values. Last we propose a binarized logistic regression model and its learning algorithm to train the feature importance models jointly. We theoretically proved that our approach has the same time complexity as Logistic Regression. Empirical results on three real-world biomedical datasets show that, our approach can detect meaningful feature importance distributions, which could have profound sociological implications. 
Code, data and full results are publicly available in paper github repository. All the results are reproducible by simply using one command.", + "title": "On the Discovery of Feature Importance Distribution: An Overlooked Area", + "authors": [ + "Yuxiao Huang" + ], + "emails": [ + "~Yuxiao_Huang1" + ], + "rank": 2652 + }, + { + "url": "https://openreview.net/forum?id=Z6bT159ZA2E", + "ratings": [ + 4, + 3, + 5, + 4 + ], + "rating": "4.00", + "confidences": [ + 5, + 4, + 4, + 3 + ], + "abstract": "Object detection remains one of the most notorious open problems in computer\nvision. Despite large strides in accuracy and speed in recent years, modern object\ndetectors have started to saturate on popular benchmarks. How far can we push\nthe detection accuracy with the current deep learning tools and tricks? In this\nwork, by employing two popular state-of-the-art object detection benchmarks,\nMMDetection and Detectron2, and analyzing more than 15 models over 4\nlarge-scale datasets, we systematically determine the upper bound in AP, which is\n91.6% on PASCAL VOC (test2007), 78.2% on MS COCO (val2017), and 58.9%\non OpenImages (V4 validation set), regardless of the IOU. These numbers are\nmuch higher than the mAP of the best model (e.g., 58% on MS COCO according\nto the most recent results). Interestingly, the gap seems to be almost closed at\nIOU=0.5. We also analyze the role of context in object recognition and detection\nand find that the canonical object size leads to the best recognition accuracy.\nFinally, we carefully characterize the sources of errors in deep object detectors and\nfind that classification error (confusion with other classes and misses) explains the\nlargest fraction of errors and weighs more than localization error. Further, models\nfrequently miss small objects, more often than medium and large ones. Our work\ntaps into the tight relationship between object recognition and detection and offers\ninsights to build better object detectors. 
Similar analyses can also be conducted\nfor other tasks in computer vision such as for instance segmentation and object\ntracking. The code is available at [TBA].", + "title": "EMPIRICAL UPPER BOUND IN OBJECT DETECTION", + "authors": [ + "ali borji" + ], + "emails": [ + "~ali_borji1" + ], + "rank": 2653 + }, + { + "url": "https://openreview.net/forum?id=ABZSAe9gNeg", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "Machine learning practitioners frequently seek to leverage the most informative available data, without violating the data owner's privacy, when building predictive models. Differentially private data synthesis protects personal details from exposure, and allows for the training of differentially private machine learning models on privately generated datasets. But how can we effectively assess the efficacy of differentially private synthetic data? In this paper, we survey four differentially private generative adversarial networks for data synthesis. We evaluate each of them at scale on five standard tabular datasets, and in two applied industry scenarios. We benchmark with novel metrics from recent literature and other standard machine learning tools. Our results suggest some synthesizers are more applicable for different privacy budgets, and we further demonstrate complicating domain-based tradeoffs in selecting an approach. We offer experimental learning on applied machine learning scenarios with private internal data to researchers and practitioners alike. In addition, we propose QUAIL, a two model hybrid approach to generating synthetic data. 
We examine QUAIL's tradeoffs, and note circumstances in which it outperforms baseline differentially private supervised learning models under the same budget constraint.", + "title": "Differentially Private Synthetic Data: Applied Evaluations and Enhancements", + "authors": [ + "Lucas Rosenblatt", + "Xiaoyan Liu", + "Samira Pouyanfar", + "Eduardo de Leon", + "Anuj Desai", + "Joshua Allen" + ], + "emails": [ + "microsoft.com", + "~Lucas_Rosenblatt1", + "~Xiaoyan_Liu1" + ], + "rank": 2654 + }, + { + "url": "https://openreview.net/forum?id=zmgJIjyWSOw", + "decision": "Reject", + "ratings": [ + 4, + 3, + 4, + 5 + ], + "rating": "4.00", + "confidences": [ + 4, + 5, + 3, + 5 + ], + "abstract": "This paper extends the BERT model to user data for pretraining user representations in a self-supervised way. By viewing actions (e.g., purchases and clicks) in behavior sequences (i.e., usage history) in an analogous way to words in sentences, we propose methods for the tokenization, the generation of input representation vectors and a novel pretext task to enable the pretraining model to learn from its own input, omitting the burden of collecting additional data. Further, our model adopts a unified structure to simultaneously learn from long-term and short-term user behavior as well as user profiles. Extensive experiments demonstrate that the learned representations result in significant improvements when transferred to three different real-world tasks, particularly in comparison with task-specific modeling and representations obtained from multi-task learning. 
", + "title": "UserBERT: Self-supervised User Representation Learning", + "authors": [ + "Tianyu Li", + "Ali Cevahir", + "Derek Cho", + "Hao Gong", + "DuyKhuong Nguyen", + "Bjorn Stenger" + ], + "emails": [ + "rakuten.com", + "~Tianyu_Li2" + ], + "rank": 2655 + }, + { + "url": "https://openreview.net/forum?id=hTUPgfEobsm", + "decision": "Reject", + "ratings": [ + 3, + 4, + 5 + ], + "rating": "4.00", + "confidences": [ + 2, + 2, + 2 + ], + "abstract": "This paper proposes Affine Disentangled GAN (ADIS-GAN), which is a Generative Adversarial Network that can explicitly disentangle affine transformations in a self-supervised and rigorous manner. The objective is inspired by InfoGAN, where an additional affine regularizer acts as the inductive bias. The affine regularizer is rooted in the affine transformation properties of images, changing some properties of the underlying images, while leaving all other properties invariant. We derive the affine regularizer by decomposing the affine matrix into separate transformation matrices and inferring the transformation parameters by maximum-likelihood estimation. Unlike the disentangled representations learned by existing approaches, the features learned by ADIS-GAN are axis-aligned and scalable, where transformations such as rotation, horizontal and vertical zoom, horizontal and vertical skew, horizontal and vertical translation can be explicitly selected and learned. ADIS-GAN successfully disentangles these features on the MNIST, CelebA, and dSprites datasets.", + "title": "ADIS-GAN: Affine Disentangled GAN", + "authors": [ + "Letao Liu", + "Martin Saerbeck", + "Justin Dauwels" + ], + "emails": [ + "~Letao_Liu1", + "ntu.edu.sg", + "tuvsud.com" + ], + "rank": 2656 + }, + { + "url": "https://openreview.net/forum?id=cCwbeIZfOqX", + "ratings": [ + 7, + 3, + 3 + ], + "rating": "4.00", + "confidences": [ + 2, + 4, + 2 + ], + "abstract": "Causal visual discovery is a fundamental yet challenging problem in many research fields. 
Given visual data and the outcome of interest, the goal is to infer the cause-effect relation. Aside from rich visual ('visible') variables, oftentimes, the outcome is also determined by 'invisible' variables, i.e. the variables from non-visual modalities that do not have visual counterparts. This (visible,invisible) combination is particularly common in the clinical domain.\n\nBuilt upon the promising invariant causal prediction (ICP) framework, we propose a novel \u03b5-ICP algorithm to resolve the (visible, invisible) setting. To efficiently discover \u03b5-plausible causal variables and to estimate the cause-effect relation, the \u03b5-ICP is learned under a min-min optimisation scheme. Driven by the need for clinical reliability and interpretability, the \u03b5-ICP is implemented with a typed neural-symbolic functional language. With the built-in program synthesis method, we can synthesize a type-safe program that is comprehensible to the clinical experts.\n\nFor concept validation of the \u03b5-ICP, we carefully design a series of synthetic experiments on the type of visual-perception tasks that are encountered in daily life. To further substantiate the proposed method, we demonstrate the application of \u03b5-ICP on a real-world cancer study dataset, Swiss CRC. This population-based cancer study has spanned over two decades, including 25k fully annotated tissue micro-array (TMA) images with at least 3k\u00d73k resolution and a broad spectrum of clinical meta data for 533 patients. Both the synthetic and clinical experiments demonstrate the advantages of \u03b5-ICP over the state-of-the-art methods. 
Finally, we discuss the limitations and challenges to be addressed in the future.\n", + "title": "Visible and Invisible: Causal Variable Learning and its Application in a Cancer Study", + "authors": [ + "Jiqing Wu", + "Inti Zlobec", + "Viktor K\u00f6lzer" + ], + "emails": [ + "~Jiqing_Wu1", + "pathology.unibe.ch", + "usz.ch" + ], + "rank": 2657 + }, + { + "url": "https://openreview.net/forum?id=2isb_482lP", + "decision": "Reject", + "ratings": [ + 4, + 3, + 4, + 6 + ], + "rating": "4.00", + "confidences": [ + 5, + 4, + 4, + 2 + ], + "abstract": "In this work, we study the behavior of standard models for community detection under spectral manipulations. Through various ablation experiments, we evaluate the impact of bandpass filtering on the numerical performances of a GCN: we empirically show that most of the necessary and used information for nodes classification is contained in the low-frequency domain, and thus contrary to Euclidean graph (e.g., images), high-frequencies are less crucial to community detection. 
In particular, it is possible to obtain accuracies at a state-of-the-art level with simple classifiers that rely only on a few low frequencies: this is surprising because contrary to GCNs, no cascade of filtering along the graph structure is involved and it indicates that the important spectral components for the supervised community detection task are essentially in the low-frequency domain.", + "title": "A Spectral Perspective on Deep Supervised Community Detection", + "authors": [ + "Nathan Grinsztajn", + "Philippe Preux", + "Edouard Oyallon" + ], + "emails": [ + "~Nathan_Grinsztajn1", + "~Philippe_Preux1", + "~Edouard_Oyallon1" + ], + "rank": 2658 + }, + { + "url": "https://openreview.net/forum?id=2KSsaPGemn2", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Reinforcement Learning algorithms have reached new heights in performance, often overtaking humans on several challenging tasks such as Atari and Go. However, the resulting models learn fragile policies that are unable to transfer between tasks without full retraining. Successor features aim to improve this situation by decomposing the policy into two components: one capturing environmental dynamics and the other modelling reward. Under this framework, transfer between related tasks requires only training the reward component. However, successor features builds upon the limiting assumption that the current reward can be predicted from a linear combination of state features. This paper proposes a novel improvement to the successor feature framework, where we instead assume that the reward function is a non-linear function of the state features, thereby increasing its representational power. After derivation of the new state-action value function, the decomposition includes a second term that learns the auto-correlation matrix between state features. 
Experimentally, we show this term explicitly models the environment's stochasticity and can also be used in place of \u03f5-greedy exploration methods during transfer. The performance of the proposed improvements to the successor feature framework is validated empirically on navigation tasks and control of a simulated robotic arm.", + "title": "Non-Linear Rewards For Successor Features", + "authors": [ + "Norman L Tasfi", + "Miriam Capretz" + ], + "emails": [ + "~Miriam_Capretz1", + "~Norman_L_Tasfi1" + ], + "rank": 2659 + }, + { + "url": "https://openreview.net/forum?id=-5W5OBfFlwX", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 3, + 4, + 3 + ], + "abstract": "EXP-based algorithms are often used for exploration in multi-armed bandit. We revisit the EXP3.P algorithm and establish both the lower and upper bounds of regret in the Gaussian multi-armed bandit setting, as well as a more general distribution option. The analyses do not require bounded rewards compared to classical regret assumptions. We also extend EXP4 from multi-armed bandit to reinforcement learning to incentivize exploration by multiple agents. The resulting algorithm has been tested on hard-to-explore games and it shows an improvement on exploration compared to state-of-the-art.", + "title": "Regret Bounds and Reinforcement Learning Exploration of EXP-based Algorithms", + "authors": [ + "Mengfan Xu", + "Diego Klabjan" + ], + "emails": [ + "~Mengfan_Xu1", + "~Diego_Klabjan1" + ], + "rank": 2660 + }, + { + "url": "https://openreview.net/forum?id=k16LHiZVGmF", + "ratings": [ + 3, + 5, + 4 + ], + "rating": "4.00", + "confidences": [ + 2, + 2, + 4 + ], + "abstract": "Predicting future discontinuous phenomena that are unobservable from the training data sets has been a challenging problem for scientific machine learning. 
In this paper, we introduce a novel learning paradigm to predict the emergence and evolution of various kinds of discontinuities for hyperbolic dynamic systems based on smooth observation data. At the heart of our approach is a templaterizable and data-driven Riemann solver that functions as a strong inductive prior to tackle the potential discontinuities.\nThe key design of our templaterized Riemann approximator is inspired by the classical Roe solver (P. L. Roe, J. Comput. Phys., vol. 43, 1981), which served as a fundamental mathematical tool for simulating various hyperbolic systems in computational physics. \nBy carefully designing the computing primitives, data flow, and incorporating a novel pseudoinverse processing module, we enable our data-driven predictor to inherently satisfy all the essential mathematical criteria of a Roe hyperbolic solver derived from the first principles and hence deliver accurate predictions of hyperbolic dynamics.", + "title": "RoeNets: Predicting Discontinuity of Hyperbolic Systems from Continuous Data", + "authors": [ + "Shiying Xiong", + "Xingzhe He", + "Shuqi Yang", + "Yunjin Tong", + "Runze Liu", + "Bo Zhu" + ], + "emails": [ + "dartmouth.edu", + "~Bo_Zhu2", + "~Shiying_Xiong1", + "~Xingzhe_He1", + "~Shuqi_Yang2" + ], + "rank": 2661 + }, + { + "url": "https://openreview.net/forum?id=wqRvVvMbJAT", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "This paper concerns the use of objectness measures to improve the calibration performance of Convolutional Neural Networks (CNNs). CNNs have proven to be very good classifiers and generally localize objects well; however, the loss functions typically used to train classification CNNs do not penalize inability to localize an object, nor do they take into account an object's relative size in the given image. 
During training on ImageNet-1K almost all approaches use random crops on the images and this transformation sometimes provides the CNN with background only samples. This causes the classifiers to depend on context. Context dependence is harmful for safety-critical applications. We present a novel approach to classification that combines the ideas of objectness and label smoothing during training. Unlike previous methods, we compute a smoothing factor that is \\emph{adaptive} based on relative object size within an image. This causes our approach to produce confidences that are grounded in the size of the object being classified instead of relying on context to make the correct predictions. We present extensive results using ImageNet to demonstrate that CNNs trained using adaptive label smoothing are much less likely to be overconfident in their predictions. We show qualitative results using class activation maps and quantitative results using classification and transfer learning tasks. Our approach is able to produce an order of magnitude reduction in confidence when predicting on context only images when compared to baselines. Using transfer learning, we gain 0.021AP on MS COCO compared to the hard label approach.", + "title": "One Size Doesn't Fit All: Adaptive Label Smoothing", + "authors": [ + "Ujwal Krothapalli", + "Lynn Abbott" + ], + "emails": [ + "~Lynn_Abbott1", + "~Ujwal_Krothapalli1" + ], + "rank": 2662 + }, + { + "url": "https://openreview.net/forum?id=BKIS2NCUro9", + "decision": "Reject", + "ratings": [ + 4, + 3, + 5, + 4 + ], + "rating": "4.00", + "confidences": [ + 2, + 4, + 4, + 4 + ], + "abstract": "Variational autoencoder (VAE) is a generation algorithm, consisting of an encoder and a decoder, and the latent variable from the encoder is used as the input of the decoder. \nVAE is widely used for image, audio and text generation tasks. In general, the training of VAE is at risk of posterior collapsing especially for long sequential data. 
To alleviate this, modified evidence lower bounds (ELBOs) were proposed. However, these approaches heuristically control training loss using a hyper-parameter, and it is not way to solve the fundamental problem of vanilla VAE.\nIn this paper, we propose a method to insert an optimization step of the latent variable and alternately update the encoder and decoder of conditional VAE for maximizing ELBOs. \nIn experiments, we applied the latent optimization VAE (LOVAE) on ZINC database, consisting of string representation of molecules, for the inverse molecular design. \nWe showed that the proposed LOVAE achieves better performance than vanilla VAE in terms of ELBOs and molecular generation performance. In addition, the proposed method showed better performance in property satisfaction and property maximization tasks compared to existing works.", + "title": "LATENT OPTIMIZATION VARIATIONAL AUTOENCODER FOR CONDITIONAL MOLECULAR GENERATION", + "authors": [ + "Kisoo Kwon", + "Jung-Hyun Park", + "Kuhwan Jeong", + "Sunjae Lee", + "Hoshik Lee" + ], + "emails": [ + "~Kuhwan_Jeong1", + "~Jung-Hyun_Park1", + "~Kisoo_Kwon1", + "~Hoshik_Lee1", + "~Sunjae_Lee1" + ], + "rank": 2663 + }, + { + "url": "https://openreview.net/forum?id=ZlIfK1wCubc", + "decision": "Reject", + "ratings": [ + 4, + 3, + 5 + ], + "rating": "4.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": " We propose a self-supervised method that builds sentence embeddings from the combination of diverse explicit syntactic structures of a sentence. We assume structure is crucial to build consistent representations as we expect sentence meaning to be a function from both syntax and semantic aspects. In this perspective, we hypothesize that some linguistic representations might be better adapted given the considered task or sentence. We, therefore, propose to jointly learn individual representation functions for different syntactic frameworks. 
Again, by hypothesis, all such functions should encode similar semantic information differently and consequently, be complementary for building better sentential semantic embeddings. To assess such hypothesis, we propose an original contrastive multi-view framework that induces an explicit interaction between models during the training phase. We make experiments combining various structures such as dependency, constituency, or sequential schemes. We evaluate our method on standard sentence embedding benchmarks. Our results outperform comparable methods on several tasks.\n\n\n", + "title": "Contrasting distinct structured views to learn sentence embeddings", + "authors": [ + "Antoine Simoulin", + "Benoit Crabb\u00e9" + ], + "emails": [ + "~Antoine_Simoulin1", + "gmail.com" + ], + "rank": 2664 + }, + { + "url": "https://openreview.net/forum?id=A_MbFRk3qT", + "ratings": [ + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 2, + 3, + 4 + ], + "abstract": "Most non-linear neural networks are known to have poor local minima (Yun et al. (2019)) and it is shown that training a neural network is NP-hard (Blum & Rivest (1988)). A line of work has studied the global optimality of neural networks in various settings but unfortunately all previous networks without spurious local minima are linear networks or networks with unrealistic assumptions. In this work we demonstrate for the first time that a non-linear neural network can have no poor local minima under no assumptions.\nRecently, a number of papers considered complex-valued neural networks (CVNNs) in various settings and suggest that CVNNs have competitive or even preferable behaviour compared to real-valued networks. Unfortunately, there is currently no theoretical analysis on the optimization of complex-valued networks, given that complex functions usually have a disparate optimization landscape. This is the first work towards analysing the optimization landscape of CVNNs. 
We prove a surprising result that no spurious local minima exist for one hidden layer complex-valued neural networks with quadratic activation. Since CVNNs can have real-valued datasets and there are no assumptions, our results are applicable to practical networks. Along the way, we develop a novel set of tools and techniques for analyzing the optimization of CVNNs, which may be useful in other contexts. Lastly, we prove spurious local minima exist for CVNNs with non-analytic CReLU activation.", + "title": "Complex neural networks have no spurious local minima", + "authors": [ + "Xingtu Liu" + ], + "emails": [ + "~Xingtu_Liu1" + ], + "rank": 2665 + }, + { + "url": "https://openreview.net/forum?id=JNP-CqSjkDb", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 3 + ], + "rating": "4.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Transformer has achieved state of the art performance in multiple Natural Language Processing tasks recently. Yet the Feed Forward Network(FFN) in a Transformer block is computationally expensive. In this paper, we present a framework to transform Recurrent Neural Networks(RNNs) and their variants into self-attention-style models, with an approximation of Banach Fixed-point Theorem. Within this framework, we propose a new model, StarSaber, by solving a set of equations obtained from RNN with Fixed-point Theorem and further approximate it with a Multi-layer Perceptron. It provides a view of stacking layers. StarSaber achieves better performance than both the vanilla Transformer and an improved version called ReZero on three datasets and is more computationally efficient, due to the reduction of Transformer's FFN layer. It has two major parts. One is a way to encode position information with two different matrices. For every position in a sequence, we have a matrix operating on positions before it and another matrix operating on positions after it. 
The other is the introduction of direct paths from the input layer to the rest of layers. Ablation studies show the effectiveness of these two parts. We additionally show that other RNN variants such as RNNs with gates can also be transformed in the same way, outperforming the two kinds of Transformers as well.", + "title": "Transforming Recurrent Neural Networks with Attention and Fixed-point Equations", + "authors": [ + "Zhaobin Xu", + "Baotian Hu", + "Buzhou Tang" + ], + "emails": [ + "~Buzhou_Tang1", + "~Zhaobin_Xu2", + "~Baotian_Hu1" + ], + "rank": 2666 + }, + { + "url": "https://openreview.net/forum?id=mo3Uqtnvz_", + "decision": "Reject", + "ratings": [ + 3, + 4, + 4, + 5 + ], + "rating": "4.00", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Many commonly-used detection frameworks aim to handle the multi-scale object detection problem. The input image is always encoded to multi-scale features and objects grouped by scale range are assigned to the corresponding features. However, the design of multi-scale feature production is quite hand-crafted or partially automatic. In this paper, we show that more possible architectures of encoder network and different strategies of feature utilization can lead to superior performance. Specifically, we propose an efficient and effective multi-scale network architecture search method (MSNAS) to improve multi-scale object detection by jointly optimizing network stride search of the encoder and appropriate feature selection for detection heads. 
We demonstrate the effectiveness of the method on COCO dataset and obtain a remarkable performance gain with respect to the original Feature Pyramid Networks.", + "title": "Multi-scale Network Architecture Search for Object Detection", + "authors": [ + "Yuxin Yue", + "Quanquan Li", + "Yujie Wang" + ], + "emails": [ + "~Yuxin_Yue1", + "~Quanquan_Li1", + "~Yujie_Wang2" + ], + "rank": 2667 + }, + { + "url": "https://openreview.net/forum?id=zFM0Uo_GnYE", + "decision": "Reject", + "ratings": [ + 4, + 3, + 5, + 4 + ], + "rating": "4.00", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Data rarely lies on uniquely Euclidean spaces. Even data typically represented in regular domains, such as images, can have a higher level of relational information, either between data samples or even relations within samples, e.g., how the objects in an image are linked. With this perspective our data points can be enriched by explicitly accounting for this connectivity and analyzing them as a graph. Herein, we analyze various approaches for unsupervised representation learning and investigate the importance of considering topological information and its impact when learning representations. We explore a spectrum of models, ranging from uniquely learning representations based on the isolated features of the nodes (focusing on Variational Autoencoders), to uniquely learning representations based on the topology (using node2vec) passing through models that integrate both node features and topological information in a hybrid fashion. For the latter we use Graph Neural Networks, precisely Deep Graph Infomax (DGI), and an extension of the typical formulation of the VAE where the topological structure is accounted for via an explicit regularization of the loss (Graph-Regularized VAEs, introduced in this work). To extensively investigate these methodologies, we consider a wide variety of data types: synthetic data point clouds, MNIST, citation networks, and chemical reactions. 
We show that each of the representations learned by these models may have critical importance for further downstream tasks, and that accounting for the topological features can greatly improve the modeling capabilities for certain problems. We further provide a framework to analyze these, and future models under different scenarios and types of data.", + "title": "On the Importance of Looking at the Manifold", + "authors": [ + "Nil Adell Mill", + "Jannis Born", + "Nathaniel Park", + "James Hedrick", + "Mar\u00eda Rodr\u00edguez Mart\u00ednez", + "Matteo Manica" + ], + "emails": [ + "zurich.ibm.com", + "us.ibm.com", + "~Jannis_Born1", + "~Matteo_Manica1", + "~Nil_Adell_Mill1", + "~James_Hedrick1" + ], + "rank": 2668 + }, + { + "url": "https://openreview.net/forum?id=Fo6S5-3Dx_", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "In this paper, we propose a deep evolutionary learning (DEL) process that integrates fragment-based deep generative model and multi-objective evolutionary computation for molecular design. Our approach enables (1) evolutionary operations in the latent space of the generative model, rather than the structural space, to generate novel promising molecular structures for the next evolutionary generation, and (2) generative model fine-tuning using newly generated high-quality samples. Thus, DEL implements a data-model co-evolution concept which improves both sample population and generative model learning. 
Experiments on two public datasets indicate that sample population obtained by DEL exhibits improved property distributions, and dominates samples generated by multi-objective Bayesian optimization algorithms.", + "title": "Deep Evolutionary Learning for Molecular Design", + "authors": [ + "Yifeng Li", + "Hsu Kiang Ooi", + "Alain Tchagang" + ], + "emails": [ + "~Hsu_Kiang_Ooi1", + "nrc-cnrc.gc.ca", + "~Yifeng_Li1" + ], + "rank": 2669 + }, + { + "url": "https://openreview.net/forum?id=nxJ8ugF24q2", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "Standard formulations of GANs, where a continuous function deforms a connected latent space, have been shown to be misspecified when fitting disconnected manifolds. In particular, when covering different classes of images, the generator will necessarily sample some low quality images in between the modes. Rather than modify the learning procedure, a line of works aims at improving the sampling quality from trained generators. Thus, it is now common to introduce a rejection step within the generation procedure.\nBuilding on this, we propose to train an additional network and transform the latent space via an adversarial learning of importance weights. This idea has several advantages: 1) it provides a way to inject disconnectedness on any GAN architecture, 2) the rejection avoids going through both the generator and the discriminator saving computation time, 3) this importance weights formulation provides a principled way to estimate the Wasserstein's distance to the true distribution, enabling its minimization. 
We demonstrate the effectiveness of our method on different datasets, both synthetic and high dimensional, and stress its superiority on highly disconnected data.", + "title": "Learning Disconnected Manifolds: Avoiding The No Gan's Land by Latent Rejection", + "authors": [ + "Thibaut Issenhuth", + "Ugo Tanielian", + "David Picard", + "Jeremie Mary" + ], + "emails": [ + "~Jeremie_Mary1", + "~Thibaut_Issenhuth1", + "~David_Picard1", + "~Ugo_Tanielian1" + ], + "rank": 2670 + }, + { + "url": "https://openreview.net/forum?id=D3TNqCspFpM", + "decision": "Reject", + "ratings": [ + 3, + 6, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 4, + 2, + 4, + 4 + ], + "abstract": "As an important problem of causal inference, we discuss the estimation of treatment effects under the existence of unobserved confounding. By representing the confounder as a latent variable, we propose Counterfactual VAE, a new variant of variational autoencoder, based on recent advances in identifiability of representation learning. Combining the identifiability and classical identification results of causal inference, under mild assumptions on the generative model and with small noise on the outcome, we theoretically show that the confounder is identifiable up to an affine transformation and then the treatment effects can be identified. 
Experiments on synthetic and semi-synthetic datasets demonstrate that our method matches the state-of-the-art, even under settings violating our formal assumptions.", + "title": "Identifying Treatment Effects under Unobserved Confounding by Causal Representation Learning", + "authors": [ + "Pengzhou Abel Wu", + "Kenji Fukumizu" + ], + "emails": [ + "~Pengzhou_Abel_Wu1", + "~Kenji_Fukumizu1" + ], + "rank": 2671 + }, + { + "url": "https://openreview.net/forum?id=RwQZd8znR10", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 3, + 4, + 3, + 5 + ], + "abstract": "Deep reinforcement learning algorithms generally require large amounts of data to solve a single task. Meta reinforcement learning (meta-RL) agents learn to adapt to novel unseen tasks with high sample efficiency by extracting useful prior knowledge from previous tasks. Despite recent progress, efficient exploration in meta-training and adaptation remains a key challenge in sparse-reward meta-RL tasks. We propose a novel off-policy meta-RL algorithm to address this problem, which disentangles exploration and exploitation policies and learns intrinsically motivated exploration behaviors. We design novel intrinsic rewards derived from information gain to reduce task uncertainty and encourage the explorer to collect informative trajectories about the current task. 
Experimental evaluation shows that our algorithm achieves state-of-the-art performance on various sparse-reward MuJoCo locomotion tasks and more complex Meta-World tasks.", + "title": "Intrinsically Guided Exploration in Meta Reinforcement Learning", + "authors": [ + "Jin Zhang", + "Jianhao Wang", + "Hao Hu", + "Tong Chen", + "Yingfeng Chen", + "Changjie Fan", + "Chongjie Zhang" + ], + "emails": [ + "~Yingfeng_Chen1", + "~Tong_Chen3", + "~Jin_Zhang6", + "~Chongjie_Zhang1", + "~Changjie_Fan1", + "~Hao_Hu3", + "~Jianhao_Wang1" + ], + "rank": 2672 + }, + { + "url": "https://openreview.net/forum?id=SNT0H2HbeRq", + "ratings": [ + 5, + 3, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Convolutional Networks (ConvNets) suffer from insufficient robustness to common corruptions and perturbations of the input, unseen during training. We address this problem by including a form of response inhibition in the processing of the early layers of existing ConvNets and assess the resulting representation power and generalization on corrupted inputs. The considered inhibition mechanism consists of a non-linear computation that is inspired by the push-pull inhibition exhibited by some neurons in the visual system of the brain. In practice, each convolutional filter (push) in the early layers of conventional ConvNets is coupled with another filter (pull), which responds to the preferred pattern of the corresponding push filter but of opposite contrast. The rectified responses of the push and pull filter pairs are then combined by a linear function. This results in a representation that suppresses responses to noisy patterns (e.g. texture, Gaussian, shot, distortion, and others) and accentuates responses to preferred patterns. \n\nWe deploy the layer into existing architectures, (Wide-)ResNet and DenseNet, and propose new residual and dense push-pull layers. 
We demonstrate that ConvNets that embed this inhibition into the initial layers are able to learn representations that are robust to several types of input corruptions. We validated the approach on the ImageNet and CIFAR data sets and their corrupted and perturbed versions, ImageNet-C/P and CIFAR-C/P. It turns out that the push-pull inhibition enhances the overall robustness and generalization of ConvNets to corrupted and perturbed input data. Besides the improvement in generalization, notable is the fact that ConvNets with push-pull inhibition have a sparser representation than conventional ones without inhibition. The code and trained models will be made available.", + "title": "Inhibition-augmented ConvNets", + "authors": [ + "Nicola Strisciuglio", + "George Azzopardi", + "Nicolai Petkov" + ], + "emails": [ + "~Nicola_Strisciuglio2", + "~George_Azzopardi2", + "~Nicolai_Petkov1" + ], + "rank": 2673 + }, + { + "url": "https://openreview.net/forum?id=gQRTfX0B8d8", + "ratings": [ + 4, + 5, + 3, + 4 + ], + "rating": "4.00", + "confidences": [ + 4, + 5, + 5, + 4 + ], + "abstract": "The tremendous progress of autoencoders and generative adversarial networks (GANs) has led to their application to multiple critical tasks, such as fraud detection and sanitized data generation. This increasing adoption has fostered the study of security and privacy risks stemming from these models. However, previous works have mainly focused on membership inference attacks. In this work, we explore one of the most severe attacks against machine learning models, namely the backdoor attack, against both autoencoders and GANs. The backdoor attack is a training time attack where the adversary implements a hidden backdoor in the target model that can only be activated by a secret trigger. State-of-the-art backdoor attacks focus on classification-based tasks. We extend the applicability of backdoor attacks to autoencoders and GAN-based models. 
More concretely, we propose the first backdoor attack against autoencoders and GANs where the adversary can control what the decoded or generated images are when the backdoor is activated. Our results show that the adversary can build a backdoored autoencoder that returns a target output for all backdoored inputs, while behaving perfectly normal on clean inputs. Similarly, for the GANs, our experiments show that the adversary can generate data from a different distribution when the backdoor is activated, while maintaining the same utility when the backdoor is not.", + "title": "BAAAN: Backdoor Attacks Against Auto-encoder and GAN-Based Machine Learning Models", + "authors": [ + "Ahmed Salem", + "Yannick Sautter", + "Michael Backes", + "Mathias Humbert", + "Yang Zhang" + ], + "emails": [ + "~Yang_Zhang15", + "armasuisse.ch", + "gmail.com", + "~Ahmed_Salem2", + "~Michael_Backes1" + ], + "rank": 2674 + }, + { + "url": "https://openreview.net/forum?id=MAF2IYqkEYD", + "ratings": [ + 3, + 4, + 4, + 5 + ], + "rating": "4.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Research in computational neuroscience suggests that the human brain's unparalleled data efficiency is a result of highly efficient mechanisms to extract and organize slowly changing high level features from continuous sensory inputs.\nIn this paper, we apply this \\textit{slowness principle} to a state of the art representation learning method with the goal of performing data efficient learning of down-stream regression tasks.\nTo this end, we propose the \\textit{slow variational autoencoder} (S-VAE), an extension to the \u03b2-VAE which applies a temporal similarity constraint to the latent representations.\nWe empirically compare our method to the \u03b2-VAE and the Temporal Difference VAE (TD-VAE), a state-of-the-art method for next frame prediction in latent space with temporal abstraction.\nWe evaluate the three methods against their data-efficiency on down-stream tasks using a 
synthetic 2D ball tracking dataset and a dataset generated using the DeepMind Lab environment.\nIn both tasks, the proposed method outperformed the baselines both with dense and sparse labeled data.\nFurthermore, the S-VAE achieved similar performance compared to the baselines with 1/5 to 1/11 of data.", + "title": "Unsupervised Learning of Slow Features for Data Efficient Regression", + "authors": [ + "Oliver Struckmeier", + "Kshitij Tiwari", + "Ville Kyrki" + ], + "emails": [ + "oulu.fi", + "~Oliver_Struckmeier1", + "aalto.fi" + ], + "rank": 2675 + }, + { + "url": "https://openreview.net/forum?id=V6WHleb2nV", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 5, + 5, + 4, + 4 + ], + "abstract": "Retrosynthesis is a problem to infer reactant compounds to synthesize a given\nproduct compound through chemical reactions. Recent studies on retrosynthesis\nfocus on proposing more sophisticated prediction models, but the dataset to feed\nthe models also plays an essential role in achieving the best generalizing models.\nGenerally, a dataset that is best suited for a specific task tends to be small. In\nsuch a case, it is the standard solution to transfer knowledge from a large or\nclean dataset in the same domain. In this paper, we conduct a systematic and\nintensive examination of data transfer approaches on end-to-end generative models,\nin application to retrosynthesis. Experimental results show that typical data transfer\nmethods can improve test prediction scores of an off-the-shelf Transformer baseline\nmodel. Especially, the pre-training plus fine-tuning approach boosts the accuracy\nscores of the baseline, achieving the new state-of-the-art. In addition, we conduct a\nmanual inspection for the erroneous prediction results. 
The inspection shows that\nthe pre-training plus fine-tuning models can generate chemically appropriate or\nsensible proposals in almost all cases.", + "title": "Data Transfer Approaches to Improve Seq-to-Seq Retrosynthesis", + "authors": [ + "Katsuhiko Ishiguro", + "Kazuya Ujihara", + "Ryohto Sawada", + "Hirotaka Akita", + "Masaaki Kotera" + ], + "emails": [ + "~Hirotaka_Akita1", + "preferred.jp", + "~Katsuhiko_Ishiguro1" + ], + "rank": 2676 + }, + { + "url": "https://openreview.net/forum?id=GJkTaYTmzVS", + "decision": "Reject", + "ratings": [ + 5, + 3, + 4 + ], + "rating": "4.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Contemporary coding education often present students with the task of developing programs that have user interaction and complex dynamic systems, such as mouse based games. While pedagogically compelling, grading such student programs requires dynamic user inputs, therefore they are difficult to grade by unit tests. In this paper we formalize the challenge of grading interactive programs as a task of classifying Markov Decision Processes (MDPs). Each student's program fully specifies an MDP where the agent needs to operate and decide, under reasonable generalization, if the dynamics and reward model of the input MDP conforms to a set of latent MDPs. We demonstrate that by experiencing a handful of latent MDPs millions of times, we can use the agent to sample trajectories from the input MDP and use a classifier to determine membership. Our method drastically reduces the amount of data needed to train an automatic grading system for interactive code assignments and present a challenge to state-of-the-art reinforcement learning generalization methods. Together with Code.org, we curated a dataset of 700k student submissions, one of the largest dataset of anonymized student submissions to a single assignment. 
This Code.org assignment had no previous solution for automatically providing correctness feedback to students and as such this contribution could lead to meaningful improvement in educational experience.", + "title": "Play to Grade: Grading Interactive Coding Games as Classifying Markov Decision Process", + "authors": [ + "Allen Nie", + "Emma Brunskill", + "Chris Piech" + ], + "emails": [ + "~Allen_Nie1", + "gmail.com", + "~Emma_Brunskill2" + ], + "rank": 2677 + }, + { + "url": "https://openreview.net/forum?id=UAAJMiVjTY_", + "decision": "Reject", + "ratings": [ + 4, + 4, + 3, + 5 + ], + "rating": "4.00", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "For many reasoning-heavy tasks, it is challenging to find an appropriate end-to-end differentiable approximation to domain-specific inference mechanisms. Neural-Symbolic (NeSy) AI divides the end-to-end pipeline into neural perception and symbolic reasoning, which can directly exploit general domain knowledge such as algorithms and logic rules. However, it suffers from the exponential computational complexity caused by the interface between the two components, where the neural model lacks direct supervision, and the symbolic model lacks accurate input facts. As a result, they usually focus on learning the neural model with a sound and complete symbolic knowledge base while avoiding a crucial problem: where does the knowledge come from? In this paper, we present Abductive Meta-Interpretive Learning (MetaAbd), which unites abduction and induction to learn perceptual neural network and first-order logic theories simultaneously from raw data. Given the same amount of domain knowledge, we demonstrate that MetaAbd not only outperforms the compared end-to-end models in predictive accuracy and data efficiency but also induces logic programs that can be re-used as background knowledge in subsequent learning tasks. 
To the best of our knowledge, MetaAbd is the first system that can jointly learn neural networks and recursive first-order logic theories with predicate invention.", + "title": "Abductive Knowledge Induction from Raw Data", + "authors": [ + "Wang-Zhou Dai", + "Stephen Muggleton" + ], + "emails": [ + "~Wang-Zhou_Dai2", + "~Stephen_Muggleton1" + ], + "rank": 2678 + }, + { + "url": "https://openreview.net/forum?id=HfnQjEN_ZC", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 5, + 5, + 3 + ], + "abstract": "Smart watches are being increasingly used to detect human gestures and movements. Using a single smart watch, whole body movement recognition remains a hard problem because movements may not be adequately captured by the sensors in the watch. In this paper, we present a whole body movement detection study using a single smart watch in the context of ballroom dancing. Deep learning representations are used to classify well-defined sequences of movements, called \\emph{figures}. Those representations are found to outperform ensembles of random forests and hidden Markov models. The classification accuracy of 85.95\\% was improved to 92.31\\% by modeling a dance as a first-order Markov chain of figures.", + "title": "Ballroom Dance Movement Recognition Using a Smart Watch and Representation Learning", + "authors": [ + "Varun Badrinath Krishna" + ], + "emails": [ + "~Varun_Badrinath_Krishna1" + ], + "rank": 2679 + }, + { + "url": "https://openreview.net/forum?id=lJDESIaOh11", + "ratings": [ + 3, + 5, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Is it possible to develop an \"AI Pathologist\" to pass the board-certified examination of the American Board of Pathology (ABP)? To build such a system, three challenges need to be addressed. 
First, we need to create a visual question answering (VQA) dataset where the AI agent is presented with a pathology image together with a question and is asked to give the correct answer. Due to privacy concerns, pathology images are usually not publicly available. Besides, only well-trained pathologists can understand pathology images, but they barely have time to help create datasets for AI research. The second challenge is: due to the fact that it is difficult to hire highly experienced pathologists to create pathology visual questions and answers, the resulting pathology VQA dataset may contain errors such as some questions may not be relevant to the image or the answers are not given correctly. Training pathology VQA models using these noisy or even erroneous data will lead to problematic models that cannot generalize well on unseen images. The third challenge is: the medical concepts and knowledge covered in pathology question-answer (QA) pairs are very diverse while the number of QA pairs available for modeling training is limited. How to learn effective representations of diverse medical concepts based on limited data is technically demanding. In this paper, we aim to address these three challenges. To our best knowledge, our work represents the first one addressing the pathology VQA problem. To deal with the issue that a publicly available pathology VQA dataset is lacking, we create PathVQA, a VQA dataset with 32,795 questions asked from 4,998 pathology images. The questions in PathVQA are similar to those in the ABP tests. To our best knowledge, this is the first dataset for pathology VQA. To address the second challenge, we propose a learning-by-ignoring approach which automatically identifies training examples that have bad-quality and remove them from the training dataset. To address the third challenge, we propose to use crossmodal self-supervised learning to learn powerful visual and textual representations jointly. 
We perform experiments on our created PathVQA dataset and the results demonstrate the effectiveness of our proposed learning-by-ignoring method and cross-modal self-supervised learning methods.", + "title": "Pathological Visual Question Answering", + "authors": [ + "Xuehai He", + "Zhuo Cai", + "Wenlan Wei", + "Yichen Zhang", + "Luntian Mou", + "Eric Xing", + "Pengtao Xie" + ], + "emails": [ + "~Eric_Xing1", + "~Pengtao_Xie3", + "~Wenlan_Wei2", + "~Xuehai_He1", + "~Zhuo_Cai2", + "~Luntian_Mou1", + "~Yichen_Zhang1" + ], + "rank": 2680 + }, + { + "url": "https://openreview.net/forum?id=TDDZxmr6851", + "decision": "Reject", + "ratings": [ + 5, + 4, + 3 + ], + "rating": "4.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "The choice of initial learning rate can have a profound effect on the performance of deep networks. We present empirical evidence that networks exhibit sharply distinct behaviors at small and large learning rates. In the small learning rate phase, training can be understood using the existing theory of infinitely wide neural networks. At large learning rates, we find that networks exhibit qualitatively distinct phenomena that cannot be explained by existing theory: The loss grows during the early part of training, and optimization eventually converges to a flatter minimum. Furthermore, we find that the optimal performance is often found in the large learning rate phase. To better understand this behavior we analyze the dynamics of a two-layer linear network and prove that it exhibits these different phases. We find good agreement between our analysis and the training dynamics observed in realistic deep learning settings. 
\n\n", + "title": "The large learning rate phase of deep learning", + "authors": [ + "Aitor Lewkowycz", + "Yasaman Bahri", + "Ethan Dyer", + "Jascha Sohl-Dickstein", + "Guy Gur-Ari" + ], + "emails": [ + "~Jascha_Sohl-Dickstein2", + "~Yasaman_Bahri1", + "~Guy_Gur-Ari1", + "~Ethan_Dyer1", + "~Aitor_Lewkowycz2" + ], + "rank": 2681 + }, + { + "url": "https://openreview.net/forum?id=yxB3sPaqlCZ", + "ratings": [ + 2, + 5, + 5 + ], + "rating": "4.00", + "confidences": [ + 4, + 3, + 5 + ], + "abstract": "Recently there has been an increased interest in unsupervised learning of disentangled representations on the data generated from variation factors. Existing works rely on the assumption that the generative factors are independent despite this assumption is often violated in real-world scenarios. In this paper, we focus on the unsupervised learning of disentanglement in a general setting which the generative factors may be correlated. We propose an intervention-based framework to tackle this problem. In particular, first we apply a random intervention operation on a selected feature of the learnt image representation; then we propose a novel metric to measure the disentanglement by a downstream image translation task and prove it is consistent with existing ground-truth-required metrics experimentally; finally we design an end-to-end model to learn the disentangled representations with the self-supervision information from the downstream translation task. We evaluate our method on benchmark datasets quantitatively and give qualitative comparisons on a real-world dataset. 
Experiments show that our algorithm outperforms baselines on benchmark datasets when faced with correlated data and can disentangle semantic factors compared to baselines on real-world dataset.", + "title": "Unsupervised Disentanglement Learning by intervention", + "authors": [ + "Weishen Pan", + "Sen Cui", + "Changshui Zhang" + ], + "emails": [ + "mails.tsinghua.edu.cn", + "~Weishen_Pan1", + "~Changshui_Zhang2" + ], + "rank": 2682 + }, + { + "url": "https://openreview.net/forum?id=doeyA2PBjdy", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 2, + 4, + 4, + 4 + ], + "abstract": "Many methods aim to prune neural network to the maximum extent. However, there are few studies that investigate the pruning mechanism. In this work, we empirically investigate a standard framework for network pruning: pretraining large network and then pruning and retraining it. The framework has been commonly used based on heuristics, i.e., finding a good minima with a large network (pretraining phase) and retaining it with careful pruning and retraining (pruning and retraining phase). For the pretraining phase, the reason for which the large network is required to achieve good performance is examined. We hypothesize that this might come from the network relying on only a portion of its weights when trained from scratch. This way of weight utilization is referred to as imbalanced utility. The measures for weight utility and utility imbalance are proposed. We investigate the cause of the utility imbalance and the characteristics of the weight utility. For the pruning and retraining phase, whether the pruned-and-retrained network benefits from the pretrained network indded is examined. We visualize the accuracy surface of the pretrained, pruned and retrained networks and investigate the relation between them. The validation accuracy is also interpreted in association with the surface. 
", + "title": "An empirical study of a pruning mechanism", + "authors": [ + "Minju Jung", + "Hyounguk Shon", + "Eojindl Yi", + "SungHyun Baek", + "Junmo Kim" + ], + "emails": [ + "~Junmo_Kim1", + "~SungHyun_Baek1", + "~Hyounguk_Shon2", + "~Minju_Jung2", + "~Eojindl_Yi1" + ], + "rank": 2683 + }, + { + "url": "https://openreview.net/forum?id=1Kxxduqpd3E", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "GradNorm (Chen et al., 2018) is a broadly used gradient-based approach for training multitask networks, where different tasks share, and thus compete during learning, for the network parameters. GradNorm eases the fitting of all individual tasks by dynamically equalizing the contribution of each task to the overall gradient magnitude. However, it does not prevent the individual tasks\u2019 gradients from conflicting, i.e., pointing towards opposite directions, and thus resulting in a poor multitask performance. In this work we propose Rotograd, an extension to GradNorm that addresses this problem by dynamically homogenizing not only the gradient magnitudes but also their directions across tasks. For this purpose,Rotograd adds a layer of task-specific rotation matrices that aligns all the task gradients. Importantly, we then analyze Rotograd (and its predecessor) through the lens of game theory, providing theoretical guarantees on the algorithm stability and convergence. 
Finally, our experiments on several real-world datasets and network architectures show that Rotograd outperforms previous approaches for multitask learning.\n\n", + "title": "Rotograd: Dynamic Gradient Homogenization for Multitask Learning", + "authors": [ + "Adri\u00e1n Javaloy", + "Isabel Valera" + ], + "emails": [ + "~Isabel_Valera1", + "~Adri\u00e1n_Javaloy1" + ], + "rank": 2684 + }, + { + "url": "https://openreview.net/forum?id=fMHwogGqTYs", + "decision": "Reject", + "ratings": [ + 5, + 2, + 5 + ], + "rating": "4.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Current approaches for learning disentangled representations assume that independent latent variables generate the data through a single data generation process. In contrast, this manuscript considers independent causal mechanisms (ICM), which, unlike disentangled representations, directly model multiple data generation processes (mechanisms) in a coarse granularity. In this work, we aim to learn a model that disentangles each mechanism and approximates the ground-truth mechanisms from observational data. We outline sufficient conditions under which the mechanisms can be learned using a single self-supervised generative model with an unconventional mixture prior, simplifying previous methods. Moreover, we prove the identifiability of our model w.r.t. the mechanisms in the self-supervised scenario. 
We compare our approach to disentangled representations on various downstream tasks, showing that our approach is more robust to intervention, covariant shift, and noise due to the disentanglement between the data generation processes.", + "title": "Identifying Coarse-grained Independent Causal Mechanisms with Self-supervision", + "authors": [ + "Xiaoyang Wang", + "Klara Nahrstedt", + "Oluwasanmi O Koyejo" + ], + "emails": [ + "~Klara_Nahrstedt1", + "~Oluwasanmi_O_Koyejo1", + "~Xiaoyang_Wang6" + ], + "rank": 2685 + }, + { + "url": "https://openreview.net/forum?id=rQ55z6F-sY5", + "ratings": [ + 4, + 4, + 5, + 2, + 5 + ], + "rating": "4.00", + "confidences": [ + 4, + 3, + 4, + 4, + 4 + ], + "abstract": "For a given image, traditional supervised image classification using deep neural networks is akin to answering the question 'what object category does this image belong to?'. The model takes in an image as input and produces the most likely label for it. However, there is an alternate approach to arrive at the final answer which we investigate in this paper. We argue that, for any arbitrary category y~, the composed question 'Is this image of an object category y~' serves as a viable approach for image classification via. deep neural networks. The difference lies in the supplied additional information in form of the target along with the image. Motivated by the curiosity to unravel the advantages and limitations of the addressed approach, we propose Indicator Neural Networks(INN). It utilizes a pair of image and label as input and produces a boolean response. INN consists of 2 encoding components namely: label encoder and image encoder which learns latent representations for labels and images respectively. Predictor, the third component, combines the learnt individual label and image representations to make the final yes/no prediction. The network is trained end-to-end. 
We perform evaluations on image classification and fine-grained image classification datasets against strong baselines. We also investigate various components of INNs to understand their contribution in the final prediction of the model. Our probing of the modules reveals that, as opposed to traditionally trained deep counterpart, INN tends to much larger regions of the input image for generating the image features. The generated image feature is further refined by the generated label encoding prior to the final prediction.", + "title": "Exploring Target Driven Image Classification", + "authors": [ + "Aditya Singh", + "Alessandro Bay", + "Andrea Mirabile" + ], + "emails": [ + "~Alessandro_Bay1", + "~Aditya_Singh3", + "zebra.com" + ], + "rank": 2686 + }, + { + "url": "https://openreview.net/forum?id=lE1AB4stmX", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 4, + 5, + 3, + 4 + ], + "abstract": "In this work we propose for the first time a transformer-based framework for unsupervised representation learning of multivariate time series. Pre-trained models can be potentially used for downstream tasks such as regression and classification, forecasting and missing value imputation. We evaluate our models on several benchmark datasets for multivariate time series regression and classification and show that they exceed current state-of-the-art performance, even when the number of training samples is very limited, while at the same time offering computational efficiency. 
We show that unsupervised pre-training of our transformer models offers a substantial performance benefit over fully supervised learning, even without leveraging additional unlabeled data, i.e., by reusing the same data samples through the unsupervised objective.", + "title": "A Transformer-based Framework for Multivariate Time Series Representation Learning", + "authors": [ + "George Zerveas", + "Srideepika Jayaraman", + "Dhaval Patel", + "Anuradha Bhamidipaty", + "Carsten Eickhoff" + ], + "emails": [ + "~George_Zerveas1", + "ibm.com", + "us.ibm.com", + "~Carsten_Eickhoff1" + ], + "rank": 2687 + }, + { + "url": "https://openreview.net/forum?id=_QdvdkxOii6", + "decision": "Reject", + "ratings": [ + 5, + 2, + 5, + 4 + ], + "rating": "4.00", + "confidences": [ + 5, + 5, + 5, + 4 + ], + "abstract": "Sampled environment transitions are a critical input to deep reinforcement learning (DRL) algorithms. Current DRL benchmarks often allow for the cheap and easy generation of large amounts of samples such that perceived progress in DRL does not necessarily correspond to improved sample efficiency. As simulating real world processes is often prohibitively hard and collecting real world experience is costly, sample efficiency is an important indicator for economically relevant applications of DRL. We investigate progress in sample efficiency on Atari games and continuous control tasks by comparing the amount of samples that a variety of algorithms need to reach a given performance level according to training curves in the corresponding publications. We find exponential progress in sample efficiency with estimated doubling times of around 10 to 18 months on Atari, 5 to 24 months on state-based continuous control and of around 4 to 9 months on pixel-based continuous control depending on the specific task and performance level.", + "title": "Measuring Progress in Deep Reinforcement Learning Sample Efficiency ", + "authors": [ + "Florian E. 
Dorner" + ], + "emails": [ + "~Florian_E._Dorner1" + ], + "rank": 2688 + }, + { + "url": "https://openreview.net/forum?id=nRJ08rN_b17", + "decision": "Reject", + "ratings": [ + 6, + 3, + 3 + ], + "rating": "4.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Object recognition is often viewed as a feedforward, bottom-up process in machine learning, but in real neural systems, object recognition is a complicated process which involves the interplay between two signal pathways. One is the parvocellular pathway (P-pathway), which is slow and extracts fine features of objects; the other is the magnocellular pathway (M-pathway), which is fast and extracts coarse features of objects. It has been suggested that the interplay between the two pathways endows the neural system with the capacity of processing visual information rapidly, adaptively, and robustly. However, the underlying computational mechanism remains largely unknown. In this study, we build a two-pathway model to elucidate the computational properties associated with the interactions between two visual pathways. The model consists of two convolution neural networks: one mimics the P-pathway, referred to as FineNet, which is deep, has small-size kernels, and receives detailed visual inputs; the other mimics the M-pathway, referred to as CoarseNet, which is shallow, has large-size kernels, and receives blurred visual inputs. \nThe two pathways interact with each other to facilitate information processing. Specifically, we show that CoarseNet can learn from FineNet through imitation to improve its performance considerably, and that through feedback from CoarseNet, the performnace of FineNet is improved and becomes robust to noises. Using visual backward masking as an example, we demonstrate that our model can explain visual cognitive behaviors that involve the interplay between two pathways. 
We hope that this study will provide insight into understanding visual information processing and inspire the development of new object recognition architectures in machine learning.", + "title": "Vision at A Glance: Interplay between Fine and Coarse Information Processing Pathways", + "authors": [ + "Zilong Ji", + "Xiaolong Zou", + "Tiejun Huang", + "Si Wu" + ], + "emails": [ + "~Tiejun_Huang1", + "~Si_Wu1", + "~Xiaolong_Zou1", + "~Zilong_Ji1" + ], + "rank": 2689 + }, + { + "url": "https://openreview.net/forum?id=xTV-wQ-pMrU", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4 + ], + "rating": "4.00", + "confidences": [ + 4, + 2, + 4 + ], + "abstract": "Self-supervised pre-training using so-called \"pretext\" tasks has recently shown impressive performance across a wide range of tasks. In this work we advance self-supervised learning from permutations, that consists in shuffling parts of input and training a model to reorder them, improving downstream performance in classification. To do so, we overcome the main challenges of integrating permutation inversions (a discontinuous operation) into an end-to-end training scheme, heretofore sidestepped by casting the reordering task as classification, fundamentally reducing the space of permutations that can be exploited. These advances rely on two main, independent contributions. First, we use recent advances in differentiable ranking to integrate the permutation inversion flawlessly into a neural network, enabling us to use the full set of permutations, at no additional computing cost. Our experiments validate that learning from all possible permutations (up to 1018) improves the quality of the pre-trained representations over using a limited, fixed set. Second, we successfully demonstrate that inverting permutations is a meaningful pretext task in a diverse range of modalities, beyond images, which does not require modality-specific design. 
In particular, we also improve music understanding by reordering spectrogram patches in the frequency space, as well as video classification by reordering frames along the time axis. We furthermore analyze the influence of the patches that we use (vertical, horizontal, 2-dimensional), as well as the benefit of our approach in different data regimes. ", + "title": "Shuffle to Learn: Self-supervised learning from permutations via differentiable ranking", + "authors": [ + "Andrew N Carr", + "Quentin Berthet", + "Mathieu Blondel", + "Olivier Teboul", + "Neil Zeghidour" + ], + "emails": [ + "~Mathieu_Blondel1", + "~Andrew_N_Carr1", + "~Neil_Zeghidour1", + "~Quentin_Berthet2", + "~Olivier_Teboul2" + ], + "rank": 2690 + }, + { + "url": "https://openreview.net/forum?id=eHg0cXYigrT", + "decision": "Reject", + "ratings": [ + 3, + 7, + 4, + 3 + ], + "rating": "4.00", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "The availability of vast protein sequence information and rich functional annotations thereof has a large potential for protein design applications in biomedicine and synthetic biology. To this date, there exists no method for the general-purpose design of proteins without any prior knowledge about the protein of interest, such as costly and rare structure information or seed sequence fragments. However, the Gene Ontology (GO) database provides information about the hierarchical organisation of protein functions, and thus could inform generative models about the underlying complex sequence-function relationships, replacing the need for structural data. We therefore propose to use conditional generative adversarial networks (cGANs) on the task of fast de novo hierarchical multi-label protein design. We generate protein sequences exhibiting properties of a large set of molecular functions extracted from the GO database, using a single model and without any prior information. 
We shed light on efficient conditioning mechanisms and adapted network architectures thanks to a thorough hyperparameter selection process and analysis. We further provide statistically- and biologically-driven evaluation measures for generative models in the context of protein design to assess the quality of the generated sequences and facilitate progress in the field. We show that our proposed model, ProteoGAN, outperforms several baselines when designing proteins given a functional label and generates well-formed sequences.", + "title": "Conditional Generative Modeling for De Novo Hierarchical Multi-Label Functional Protein Design", + "authors": [ + "Tim Kucera", + "Karsten Michael Borgwardt", + "Matteo Togninalli", + "Laetitia Papaxanthos" + ], + "emails": [ + "bsse.ethz.ch", + "~Laetitia_Papaxanthos2", + "~Matteo_Togninalli1", + "~Tim_Kucera1" + ], + "rank": 2691 + }, + { + "url": "https://openreview.net/forum?id=bhngY7lHu_", + "decision": "Reject", + "ratings": [ + 3, + 4, + 4, + 5 + ], + "rating": "4.00", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "The definition of the update target is a crucial design choice in reinforcement learning. Due to the low computation cost and empirical high performance, n-step returns with off-policy data is a widely used update target to bootstrap from scratch. A critical issue of applying n-step returns is to identify the optimal value of n. In practice, n is often set to a fixed value, which is either determined by an empirical guess or by some hyper-parameter search. In this work, we point out that the optimal value of n actually differs on each data point, while the fixed value n is a rough average of them. The estimation error can be decomposed into two sources, off-policy bias and approximation error, and the fixed value of n is a trade-off between them. Based on that observation, we introduce a new metric, policy age, to quantify the off-policyness of each data point. 
We propose the Adaptive N-step Bootstrapping, which calculates the value of n for each data point by its policy age instead of the empirical guess. We conduct experiments on both MuJoCo and Atari games. The results show that adaptive n-step bootstrap-ping achieves state-of-the-art performance in terms of both final reward and data efficiency.", + "title": "Adaptive N-step Bootstrapping with Off-policy Data", + "authors": [ + "Guan Wang", + "Dong Yan", + "Hang Su", + "Jun Zhu" + ], + "emails": [ + "~Dong_Yan1", + "~Jun_Zhu2", + "mails.tsinghua.edu.cn", + "~Hang_Su3" + ], + "rank": 2692 + }, + { + "url": "https://openreview.net/forum?id=_CrmWaJ2uvP", + "decision": "Reject", + "ratings": [ + 3, + 4, + 6, + 3 + ], + "rating": "4.00", + "confidences": [ + 3, + 5, + 4, + 5 + ], + "abstract": "While dynamic systems can be modelled as sequence-to-sequence tasks by deep learning using different network architectures like DNN, CNN, RNNs or neural ODEs, the resulting models often provide poor understanding of the underlying system properties. We propose a new recurrent network architecture, the Dynamic Recurrent Network, where the computation function is based on the discrete difference equations of basic linear system transfer functions known from dynamic system identification. This results in a more explainable model, since the learnt weights can provide insight on a system's time dependent behaviour. It also introduces the sequences' sampling rate as an additional model parameter, which can be leveraged, for example, for time series data augmentation and model robustness checks. The network is trained using traditional gradient descent optimization and can be used in combination with other state of the art neural network layers. 
We show that our new layer type yields results comparable to or better than other recurrent layer types on several system identification tasks.", + "title": "Recurrent Neural Network Architecture based on Dynamic Systems Theory for Data Driven Modelling of Complex Physical Systems", + "authors": [ + "Deniz Neufeld" + ], + "emails": [ + "~Deniz_Neufeld1" + ], + "rank": 2693 + }, + { + "url": "https://openreview.net/forum?id=6deUA11mOJ5", + "decision": "Reject", + "ratings": [ + 5, + 3, + 4 + ], + "rating": "4.00", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Many recent developments on generative models for natural images have relied on heuristically-motivated metrics that can be easily gamed by memorizing a small sample from the true distribution or training a model directly to improve the metric.\nIn this work, we critically evaluate the gameability of the benchmarking procedure by running a competition which ultimately resulted in participants attempting to cheat. Our competition received over 11000 submitted models which allowed us to investigate memorization-aware metrics for measuring generative model performance. Specifically, we propose the Memorization-Informed Frechet Inception Distance (MiFID) and discuss ways to ensure that winning submissions were based on genuine improvements in perceptual quality. We evaluate the effectiveness of our benchmark by manually inspecting the code for the 1000 top-performing models and labeling different forms of memorization that were intentionally or unintentionally used. 
To facilitate future work on benchmarking generative models, we release generated images and our labels for these models as well as code to compute the MiFID metric.", + "title": "A Large-scale Study on Training Sample Memorization in Generative Modeling", + "authors": [ + "Ching-Yuan Bai", + "Hsuan-Tien Lin", + "Colin Raffel", + "Wendy Kan" + ], + "emails": [ + "~Wendy_Kan1", + "~Hsuan-Tien_Lin1", + "~Colin_Raffel1", + "csie.ntu.edu.tw" + ], + "rank": 2694 + }, + { + "url": "https://openreview.net/forum?id=DRc-o6DUGVf", + "ratings": [ + 3, + 5, + 4, + 4 + ], + "rating": "3.95", + "confidences": [ + 5, + 4, + 5, + 5 + ], + "abstract": "Few-shot learning problems require models to recognize novel classes with only a few supported samples. However, it remains challenging for the model to generalize novel classes with such limited samples. Driven by human behavior, researchers introduced semantic information (e.g. novel categories descriptions, label names, etc.) onto existing methods as prior knowledge to generalize more precise class representations. Despite the promising performance, these methods are under the assumption that users are able to provide precise semantic information for all target categories and this is hard to be satisfied in a real scenario. To address this problem, we proposed a novel Cross-modality Knowledge Enhancement Mechanism(CKEM) to discover task-relevant information in external semantic knowledge automatically. CKEM first utilizes Cross-modality Graph Builder(CGB) to align two unitary modality information (support labeled images and external semantic knowledge) into a cross-modality knowledge graph. After that, with the message-passing mechanism, CKEM selects and transfers relevant knowledge from external semantic knowledge bank to original visual-based class representations in Knowledge Fusion Model(KFM). 
Through a series of experiments, we show that our method improves the existing metric-based meta-learning methods with 1\\% - 5\\% for 1-shot and 5-shot settings on both mini-ImageNet and tiered-ImageNet datasets.", + "title": "cross-modal knowledge enhancement mechanism for few-shot learning", + "authors": [ + "Haiyang Zhang", + "Jiaming Duan", + "liang liu" + ], + "emails": [ + "~Jiaming_Duan1", + "bupt.edu.cn" + ], + "rank": 2695 + }, + { + "url": "https://openreview.net/forum?id=gMRZ4wLqlkJ", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5, + 3 + ], + "rating": "3.94", + "confidences": [ + 5, + 4, + 4, + 5 + ], + "abstract": "Federated learning (FL) presents an appealing opportunity for individuals who are willing to make their private data available for building a communal model without revealing their data contents to anyone else. Of central issues that may limit a widespread adoption of FL is the significant communication resources required in the exchange of updated model parameters between the server and individual clients over many communication rounds. In this work, we focus on limiting the number of model exchange rounds in FL to some small fixed number R, to control the communication burden. Following the spirit of meta-learning for few-shot learning, we take a meta-learning strategy to train the model so that once the meta-training phase is over, only R rounds of FL would produce a model that will satisfy the needs of all participating clients. A key advantage of employing meta-training is that the main labeled dataset used in training could differ significantly (e.g., different classes of images) from the actual data sample presented at inference time. Compared to the meta-training approaches to optimize personalized local models at distributed devices, our method better handles the potential lack of data variability at individual nodes. 
Extensive experimental results indicate that meta-training geared to few-round learning provide large performance improvements compared to various baselines.", + "title": "Few-Round Learning for Federated Learning", + "authors": [ + "Younghyun Park", + "Dong-Jun Han", + "Do-Yeon Kim", + "Jun Seo", + "Jaekyun Moon" + ], + "emails": [ + "~Jun_Seo1", + "~Jaekyun_Moon2", + "kaist.ac.kr", + "~Younghyun_Park1", + "~Dong-Jun_Han1" + ], + "rank": 2696 + }, + { + "url": "https://openreview.net/forum?id=OpUJ46CNv43", + "ratings": [ + 5, + 3, + 4, + 4 + ], + "rating": "3.94", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "Automated hyperparameter optimization (HPO) has shown great power in many machine learning applications. While existing methods suffer from model selection, parallelism, or sample efficiency, this paper presents a new HPO method, MOdular FActorial Design (MOFA), to address these issues simultaneously. The major idea is to use techniques from Experimental Designs to improve sample efficiency of model-free methods. Particularly, MOFA runs with four modules in each iteration: (1) an Orthogonal Latin Hypercube (OLH)-based sampler preserving both univariate projection uniformity and orthogonality; (2) a highly parallelized evaluator; (3) a transformer to collapse the OLH performance table into a specified Fractional Factorial Design--Orthogonal Array (OA); (4) an analyzer including Factorial Performance Analysis and Factorial Importance Analysis to narrow down the search space. We theoretically and empirically show that MOFA has great advantages over existing model-based and model-free methods. 
", + "title": "MOFA: Modular Factorial Design for Hyperparameter Optimization", + "authors": [ + "Bo Xiong", + "Yimin Huang", + "Steffen Staab", + "Zhenguo Li" + ], + "emails": [ + "~Zhenguo_Li1", + "~Yimin_Huang2", + "~Steffen_Staab2", + "~Bo_Xiong3" + ], + "rank": 2697 + }, + { + "url": "https://openreview.net/forum?id=qYl0OtcZKx", + "ratings": [ + 5, + 4, + 3, + 4 + ], + "rating": "3.94", + "confidences": [ + 3, + 5, + 4, + 5 + ], + "abstract": "Data augmentation tuned to datasets and tasks has had great success in various AI applications, such as computer vision, natural language processing, autonomous driving, and bioinformatics. However, most of the specific parameter-based augmentation strategies are inefficient in finding suitable augmentation parameters to improve the model performance whenever the dataset changes. We introduce a dynamic data augmentation strategy called Faster and Smarter AutoAugment (FSAA) that separates the data augmentation method through the initial policy search result. Based on our policy branching principle, the augmentation policy dynamically clusters data points within the dataset according to the degree of performance change at the data split stage. With extensive experimentation on various datasets for image recognition task, we show that FSAA dramatically reduces the GPU computation cost, a problem in the existing automatic data augmentation strategies, by up to over 90%. 
It also ensures diversity and generalization of the dataset augmentation by searching for more diverse policies than existing autonomous methods in less time, resulting in consistent performance gains and robustness across multiple models.", + "title": "Faster and Smarter AutoAugment: Augmentation Policy Search Based on Dynamic Data-Clustering", + "authors": [ + "Jonghyun Bae", + "Ji-Hoon Kim" + ], + "emails": [ + "~Ji-Hoon_Kim2", + "~Jonghyun_Bae1" + ], + "rank": 2698 + }, + { + "url": "https://openreview.net/forum?id=PBfaUXYZzU", + "decision": "Reject", + "ratings": [ + 4, + 3, + 3, + 6 + ], + "rating": "3.94", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Class distribution skews in imbalanced datasets may lead to models with prediction bias towards majority classes, making fair assessment of classifiers a challenging task. Balanced Accuracy is a popular metric used to evaluate a classifier\u2019s prediction performance under such scenarios. However, this metric falls short when classes vary in importance, especially when class importance is skewed differently from class cardinality distributions. In this paper, we propose a simple and general-purpose evaluation framework for imbalanced data classification that is sensitive to arbitrary skews in class cardinalities and importances. 
Experiments with several state-of-the-art classifiers tested on real-world datasets and benchmarks from two different domains show that our new framework is more effective than Balanced Accuracy \u2013- not only in evaluating and ranking model predictions, but also in training the models themselves.", + "title": "Class-Weighted Evaluation Metrics for Imbalanced Data Classification", + "authors": [ + "Akhilesh Gupta", + "Nesime Tatbul", + "Ryan Marcus", + "Shengtian Zhou", + "Insup Lee", + "Justin Gottschlich" + ], + "emails": [ + "~Ryan_Marcus1", + "~Shengtian_Zhou1", + "alumni.upenn.edu", + "~Nesime_Tatbul1", + "~Insup_Lee1", + "~Justin_Gottschlich1" + ], + "rank": 2699 + }, + { + "url": "https://openreview.net/forum?id=ep81NLpHeos", + "decision": "Reject", + "ratings": [ + 5, + 3, + 4, + 4 + ], + "rating": "3.94", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "Wasserstein autoencoder (WAE) shows that matching two distributions is equivalent to minimizing a simple autoencoder (AE) loss under the constraint that the latent space of this AE matches a pre-specified prior distribution. This latent space distribution matching is a core component in WAE, and is in itself a challenging task. In this paper, we propose to use the contrastive learning framework that has been shown to be effective for self-supervised representation learning, as a means to resolve this problem. We do so by exploiting the fact that contrastive learning objectives optimize the latent space distribution to be uniform over the unit hyper-sphere, which can be easily sampled from. This results in a simple and scalable algorithm that avoids many of the optimization challenges of existing generative models, while retaining the advantage of efficient sampling. Quantitatively, we show that our algorithm achieves a new state-of-the-art FID of 54.36 on CIFAR-10, and performs competitively with existing models on CelebA in terms of FID score. 
We also show qualitative results on CelebA-HQ in addition to these datasets, confirming that our algorithm can generate realistic images at multiple resolutions.", + "title": "Momentum Contrastive Autoencoder", + "authors": [ + "Devansh Arpit", + "Aadyot Bhatnagar", + "Huan Wang", + "Caiming Xiong" + ], + "emails": [ + "~Devansh_Arpit2", + "~Huan_Wang1", + "~Caiming_Xiong1", + "salesforce.com" + ], + "rank": 2700 + }, + { + "url": "https://openreview.net/forum?id=0naHZ3gZSzo", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 3 + ], + "rating": "3.94", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "Modern machine learning algorithms usually involve tuning multiple (from one to thousands) hyperparameters which play a pivotal role in terms of model generalizability. Globally choosing appropriate values of hyperparameters is extremely computationally challenging. Black-box optimization and gradient-based algorithms are two dominant approaches to hyperparameter optimization while they have totally distinct advantages. How to design a new hyperparameter optimization technique inheriting all benefits from both approaches is still an open problem. To address this challenging problem, in this paper, we propose a new hyperparameter optimization method with zeroth-order hyper-gradients (HOZOG). Specifically, we first exactly formulate hyperparameter optimization as an A-based constrained optimization problem, where A is a black-box optimization algorithm (such as deep neural network). Then, we use the average zeroth-order hyper-gradients to update hyperparameters. We provide the feasibility analysis of using HOZOG to achieve hyperparameter optimization. 
The experimental results on three representative hyperparameter (the size is from 1 to 1250) optimization tasks demonstrate the benefits of HOZOG in terms of \\textit{simplicity, scalability, flexibility, effectiveness and efficiency} compared with the state-of-the-art hyperparameter optimization methods.", + "title": "Optimizing Large-Scale Hyperparameters via Automated Learning Algorithm", + "authors": [ + "Bin Gu", + "Guodong Liu", + "Yanfu Zhang", + "Xiang Geng", + "Heng Huang" + ], + "emails": [ + "~Yanfu_Zhang1", + "~Guodong_Liu2", + "~Xiang_Geng1", + "~Heng_Huang1", + "~Bin_Gu1" + ], + "rank": 2701 + }, + { + "url": "https://openreview.net/forum?id=l-kqekaFvI9", + "ratings": [ + 5, + 4, + 3, + 4 + ], + "rating": "3.94", + "confidences": [ + 4, + 5, + 5, + 2 + ], + "abstract": "Unsupervised domain adaptation has attracted increasing attention in recent years, which adapts classifiers to an unlabeled target domain by exploiting a labeled source domain. To reduce discrepancy between source and target domains, adversarial learning methods are typically selected to seek domain-invariant representations by confusing the domain discriminator. However, classifiers may not be well adapted to such a domain-invariant representation space, as the sample-level and class-level data structures could be distorted during adversarial learning. In this paper, we propose a novel Transferable Feature Learning approach on Graphs (TFLG) for unsupervised adversarial domain adaptation, which jointly incorporates sample-level and class-level structure information across two domains. TFLG first constructs graphs for mini-batch samples, and identifies the class-wise correspondence across domains. A novel cross-domain graph convolutional operation is designed to jointly align the sample-level and class-level structures in two domains. Moreover, a memory bank is designed to further exploit the class-level information. 
Extensive experiments on benchmark datasets demonstrate the effectiveness of our approach, compared to the representative unsupervised domain adaptation methods.", + "title": "Transferable Feature Learning on Graphs Across Visual Domains", + "authors": [ + "Ronghang Zhu", + "Xiaodong Jiang", + "Jiasen Lu", + "Sheng Li" + ], + "emails": [ + "~Ronghang_Zhu2", + "~Xiaodong_Jiang1", + "~Sheng_Li3", + "~Jiasen_Lu2" + ], + "rank": 2702 + }, + { + "url": "https://openreview.net/forum?id=20qC5K2ICZL", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5, + 3 + ], + "rating": "3.94", + "confidences": [ + 3, + 4, + 4, + 5 + ], + "abstract": "Learning robust deep models against noisy labels becomes ever critical when today's data is commonly collected from open platforms and subject to adversarial corruption. The information on the label corruption process, i.e., corruption matrix, can greatly enhance the robustness of deep models but still fall behind in combating hard classes. In this paper, we propose to construct a golden symmetric loss (GSL) based on the estimated confusion matrix as to avoid overfitting to noisy labels and learn effectively from hard classes. GSL is the weighted sum of the corrected regular cross entropy and reverse cross entropy. By leveraging a small fraction of trusted clean data, we estimate the corruption matrix and use it to correct the loss as well as to determine the weights of GSL. We theoretically prove the robustness of the proposed loss function in the presence of dirty labels. We provide a heuristics to adaptively tune the loss weights of GSL according to the noise rate and diversity measured from the dataset. We evaluate our proposed golden symmetric loss on both vision and natural language deep models subject to different types of label noise patterns. 
Empirical results show that GSL can significantly outperform the existing robust training methods on different noise patterns, showing accuracy improvement up to 18% on CIFAR-100 and 1% on real world noisy dataset of Clothing1M. ", + "title": "Robust Learning via Golden Symmetric Loss of (un)Trusted Labels", + "authors": [ + "Amirmasoud Ghiassi", + "Robert Birke", + "Lydia Y. Chen" + ], + "emails": [ + "~Amirmasoud_Ghiassi1", + "~Lydia_Y._Chen1", + "~Robert_Birke1" + ], + "rank": 2703 + }, + { + "url": "https://openreview.net/forum?id=R3a2G2tSf3c", + "ratings": [ + 2, + 5, + 4, + 5 + ], + "rating": "3.94", + "confidences": [ + 5, + 4, + 4, + 5 + ], + "abstract": "Graph classification aims to predict the class label for an entire graph. Recently, Graph Neural Networks (GNNs)-based approaches become an essential strand to learn low-dimensional continuous embeddings of the entire graphs for graph label prediction. While GNNs explicitly aggregate the neighborhood information and implicitly capture the topological structure for graph representation, they ignore the relationships among graphs. In this paper, we propose a Graph-Graph Similarity Network to tackle the graph classification problem by constructing a SuperGraph through learning the relationships among graphs. Each node in the SuperGraph represents an input graph, and the weights of edges denote the similarity between graphs. By this means, the graph classification is then transformed into a classical node classification problem. Specifically, we employ an Adversarial Autoencoder to align embeddings of all the graphs to a same distribution. After the alignment, we design the Graph-Graph Similarity Network to learn the similarity between graphs, which function as the adjacency matrix of the SuperGraph. By running node classification algorithms on the SuperGraph, we can predict the labels of graphs. 
Experiments on five widely used benchmarks under a fair setting demonstrate the effectiveness of our method.", + "title": "Graph-Graph Similarity Network", + "authors": [ + "Han Yue", + "Pengyu Hong", + "Hongfu Liu" + ], + "emails": [ + "~Pengyu_Hong1", + "~Han_Yue2", + "~Hongfu_Liu2" + ], + "rank": 2704 + }, + { + "url": "https://openreview.net/forum?id=EohGx2HgNsA", + "ratings": [ + 5, + 4, + 4, + 3 + ], + "rating": "3.94", + "confidences": [ + 3, + 5, + 4, + 4 + ], + "abstract": "Neural Architecture Search (NAS) is one of the focal points for the Deep Learning community, but reproducing NAS methods is extremely challenging due to numerous low-level implementation details. To alleviate this problem we introduce NASLib, a NAS library built upon PyTorch. This framework offers high-level abstractions for designing and reusing search spaces, interfaces to benchmarks and evaluation pipelines, enabling the implementation and extension of state-of-the-art NAS methods with a few lines of code. The modularized nature of NASlib allows researchers to easily innovate on individual components (e.g., define a new search space while reusing an optimizer and evaluation pipeline, or propose a new optimizer with existing search spaces). As a result, NASLib has the potential to facilitate NAS research by allowing fast advances and evaluations that are by design free of confounding factors. To demonstrate that NASLib is a sound library, we implement and achieve state-of-the-art results with one-shot NAS optimizers (DARTS and GDAS) over the DARTS search space and the popular NAS-Bench-201 benchmark. 
Last but not least, we showcase how easily novel approaches are coded in NASLib, by training DARTS on a hierarchical search space.", + "title": "NASLib: A Modular and Flexible Neural Architecture Search Library", + "authors": [ + "Michael Ruchte", + "Arber Zela", + "Julien Niklas Siems", + "Josif Grabocka", + "Frank Hutter" + ], + "emails": [ + "~Michael_Ruchte1", + "~Josif_Grabocka1", + "~Frank_Hutter1", + "~Arber_Zela1", + "~Julien_Niklas_Siems1" + ], + "rank": 2705 + }, + { + "url": "https://openreview.net/forum?id=rq3Rt9R5rD", + "ratings": [ + 4, + 5, + 3, + 4 + ], + "rating": "3.94", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "State-of-the-art image-to-image translation methods tend to struggle in an imbalanced domain setting, where one image domain lacks richness and diversity. We introduce a new unsupervised translation network, BalaGAN, specifically designed to tackle the domain imbalance problem. We leverage the latent modalities of the richer domain to turn the image-to-image translation problem, between two imbalanced domains, into a balanced, multi-class, and conditional translation problem, more resembling the style transfer setting. Specifically, we analyze the source domain and learn a decomposition of it into a set of latent modes or classes, without any supervision. This leaves us with a multitude of balanced cross-domain translation tasks, between all pairs of classes, including the target domain. During inference, the trained network takes as input a source image, as well as a reference or style image from one of the modes as a condition, and produces an image which resembles the source on the pixel-wise level, but shares the same mode as the reference. 
We show that employing modalities within the dataset improves the quality of the translated images, and that BalaGAN outperforms strong baselines of both unconditioned and style-transfer-based image-to-image translation methods, in terms of image quality and diversity.", + "title": "BalaGAN: Image Translation Between Imbalanced Domains via Cross-Modal Transfer", + "authors": [ + "Or Patashnik", + "Dov Danon", + "Hao Zhang", + "Daniel Cohen-or" + ], + "emails": [ + "~Dov_Danon1", + "~Daniel_Cohen-or2", + "~Hao_Zhang25", + "~Or_Patashnik1" + ], + "rank": 2706 + }, + { + "url": "https://openreview.net/forum?id=BVPowUU1cR", + "decision": "Reject", + "ratings": [ + 6, + 3, + 4, + 4 + ], + "rating": "3.93", + "confidences": [ + 2, + 5, + 4, + 4 + ], + "abstract": "Some of the most popular methods for improving the stability and performance of GANs involve constraining or regularizing the discriminator. In this paper we consider a largely overlooked regularization technique which we refer to as the Adversary's Assistant (AdvAs). We motivate this using a different perspective to that of prior work. Specifically, we consider a common mismatch between theoretical analysis and practice: analysis often assumes that the discriminator reaches its optimum on each iteration. In practice, this is essentially never true, often leading to poor gradient estimates for the generator. To address this, AdvAs is a theoretically motivated penalty imposed on the generator based on the norm of the gradients used to train the discriminator. This encourages the generator to move towards points where the discriminator is optimal. We demonstrate the effect of applying AdvAs to several GAN objectives, datasets and network architectures. 
The results indicate a reduction in the mismatch between theory and practice and that AdvAs can lead to improvement of GAN training, as measured by FID scores.\n", + "title": "Assisting the Adversary to Improve GAN Training", + "authors": [ + "Andreas Munk", + "William Harvey", + "Frank Wood" + ], + "emails": [ + "~Andreas_Munk1", + "~Frank_Wood2", + "~William_Harvey1" + ], + "rank": 2707 + }, + { + "url": "https://openreview.net/forum?id=e3bhF_p0T7c", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5, + 3 + ], + "rating": "3.93", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Recent theoretical analysis suggests that ultra-wide neural networks always converge to global minima near the initialization under first order methods. However, the convergence property of neural networks with finite width could be very different. The simplest experiment with two-layer teacher-student networks shows that the input weights of student neurons eventually align with one of the teacher neurons. This suggests a distinct convergence nature for ``not-too-wide'' neural networks that there might not be any local minima near the initialization. As the theoretical justification, we prove that under the most basic settings, all student neurons must align with the teacher neuron at any local minima. The methodology is extendable to more general cases, where the proof can be reduced to analyzing the properties of a special class of functions that we call {\\em Angular Distance (AD) function}. 
Finally, we demonstrate that these properties can be easily verified numerically.", + "title": "Analysis of Alignment Phenomenon in Simple Teacher-student Networks with Finite Width", + "authors": [ + "Hanlin Zhu", + "Chengyang Ying", + "Song Zuo" + ], + "emails": [ + "~Chengyang_Ying2", + "~Hanlin_Zhu2", + "~Song_Zuo1" + ], + "rank": 2708 + }, + { + "url": "https://openreview.net/forum?id=rqzZDh8jqGj", + "decision": "Reject", + "ratings": [ + 4, + 4, + 3, + 5 + ], + "rating": "3.93", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "Abstract The impressive performance exhibited by modern machine learning models hinges on the ability to train such models on a very large amounts of labeled data. However, since access to large volumes of labeled data is often limited or expensive, it is desirable to alleviate this bottleneck by carefully curating the training set. Optimal experimental design is a well-established paradigm for selecting data point to be labeled so to maximally inform the learning process. Unfortunately, classical theory on optimal experimental design focuses on selecting examples in order to learn underparameterized (and thus, non-interpolative) models, while modern machine learning models such as deep neural networks are overparameterized, and oftentimes are trained to be interpolative. As such, classical experimental design methods are not applicable in many modern learning setups. Indeed, the predictive performance of underparameterized models tends to be variance dominated, so classical experimental design focuses on variance reduction, while the predictive performance of overparameterized models can also be, as is shown in this paper, bias dominated or of mixed nature. 
In this paper we propose a design strategy that is well suited for overparameterized regression and interpolation, and we demonstrate the applicability of our method in the context of deep learning by proposing a new algorithm for single shot deep active learning.", + "title": "Experimental Design for Overparameterized Learning with Application to Single Shot Deep Active Learning", + "authors": [ + "Neta Shoham", + "Haim Avron" + ], + "emails": [ + "~Neta_Shoham1", + "~Haim_Avron4" + ], + "rank": 2709 + }, + { + "url": "https://openreview.net/forum?id=ToWi1RjuEr8", + "decision": "Reject", + "ratings": [ + 4, + 3, + 3, + 6 + ], + "rating": "3.93", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "In this work, we aim to develop a simple and scalable reinforcement learning algorithm that uses standard supervised learning methods as subroutines, while also being able to leverage off-policy data. Our proposed approach, which we refer to as advantage-weighted regression (AWR), consists of two standard supervised learning steps: one to regress onto target values for a value function, and another to regress onto weighted target actions for the policy. The method is simple and general, can accommodate continuous and discrete actions, and can be implemented in just a few lines of code on top of standard supervised learning methods. We provide a theoretical motivation for AWR and analyze its properties when incorporating off-policy data from experience replay. We evaluate AWR on a suite of standard OpenAI Gym benchmark tasks, and show that it achieves competitive performance compared to a number of well-established state-of-the-art RL algorithms. AWR is also able to acquire more effective policies than most off-policy algorithms when learning from purely static datasets with no additional environmental interactions. 
Furthermore, we demonstrate our algorithm on challenging continuous control tasks with highly complex simulated characters.", + "title": "Advantage-Weighted Regression: Simple and Scalable Off-Policy Reinforcement Learning", + "authors": [ + "Xue Bin Peng", + "Aviral Kumar", + "Grace Zhang", + "Sergey Levine" + ], + "emails": [ + "~Xue_Bin_Peng1", + "~Aviral_Kumar2", + "~Sergey_Levine1", + "berkeley.edu" + ], + "rank": 2710 + }, + { + "url": "https://openreview.net/forum?id=Sva-fwURywB", + "decision": "Reject", + "ratings": [ + 4, + 5, + 3 + ], + "rating": "3.92", + "confidences": [ + 3, + 4, + 5 + ], + "abstract": "In this paper, we propose a novel approach to efficiently learning disentangled representations with causal mechanisms, based on the difference of conditional probabilities in original and new distributions. We approximate the difference with model's generalization abilities so that it fits in standard machine learning framework and can be efficiently computed. In contrast to the state-of-the-art approach, which relies on learner's adaptation speed to new distribution, the proposed approach only requires evaluating the generalization ability of the model. 
We provide theoretical explanation for the advantage of the proposed method, and our experiments show that the proposed technique is 1.9-11.0x more sample efficient and 9.4-32.4x quicker than the previous method on various tasks.", + "title": "Efficiently Disentangle Causal Representations", + "authors": [ + "Yuanpeng Li", + "Joel Hestness", + "Mohamed Elhoseiny", + "Liang Zhao", + "Kenneth Church" + ], + "emails": [ + "~Joel_Hestness2", + "~Yuanpeng_Li2", + "~Kenneth_Church1", + "~Liang_Zhao2", + "~Mohamed_Elhoseiny1" + ], + "rank": 2711 + }, + { + "url": "https://openreview.net/forum?id=3YQAVD9_Dz3", + "decision": "Reject", + "ratings": [ + 4, + 3, + 5 + ], + "rating": "3.92", + "confidences": [ + 3, + 5, + 4 + ], + "abstract": "Data augmentation has been widely used for enhancing the diversity of training data and model generalization. Different from traditional handcrafted methods, recent research introduced automated search for optimal data augmentation policies and achieved state-of-the-art results on image classification tasks. However, these search-based implementations typically incur high computation cost and long search time because of large search spaces and complex searching algorithms. We revisited automated augmentation from alternate perspectives, such as increasing diversity and manipulating the overall usage of augmented data. In this paper, we present an augmentation method without policy searching called NOSE Augment (NO SEarch Augment). Our method completely skips policy searching; instead, it jointly applies multi-stage augmentation strategy and introduces more augmentation operations on top of a simple stochastic augmentation mechanism. With more augmentation operations, we boost the data diversity of stochastic augmentation; and with the phased complexity driven strategy, we ensure the whole training process converged smoothly to a good quality model. 
We conducted extensive experiments and showed that our method could match or surpass state-of-the-art results provided by search-based methods in terms of accuracies. Without the need for policy search, our method is much more efficient than the existing AutoAugment series of methods. Besides image classification, we also examine the general validity of our proposed method by applying our method to Face Recognition and Text Detection of the Optical Character Recognition (OCR) problems. The results establish our proposed method as a fast and competitive data augmentation strategy that can be used across various CV tasks.", + "title": "NOSE Augment: Fast and Effective Data Augmentation Without Searching", + "authors": [ + "Qingrui Li", + "Song Xie", + "An\u0131l Oymagil", + "Mustafa Furkan Eseoglu", + "Ziyin Zhang", + "CM Lee" + ], + "emails": [ + "mail.nwpu.edu.cn", + "~CM_Lee1", + "huawei.com", + "~Mustafa_Furkan_Eseoglu1", + "~Qingrui_Li1", + "~An\u0131l_Oymagil1" + ], + "rank": 2712 + }, + { + "url": "https://openreview.net/forum?id=Shjmp-QK8Y-", + "decision": "Reject", + "ratings": [ + 4, + 5, + 3 + ], + "rating": "3.92", + "confidences": [ + 4, + 4, + 5 + ], + "abstract": "Self-attention networks (SANs) have shown promising empirical results in various natural language processing tasks. Typically, it gradually learning language knowledge on the whole training dataset in parallel and stacked ways, thereby modeling language representation. In this paper, we propose a simple and general representation method to consider prior knowledge related to language representation from the beginning of training. Also, the proposed method allows SANs to leverage prior knowledge in a universal way compatible with neural networks. Furthermore, we apply it to one prior word frequency knowledge for the monolingual data and other prior translation lexicon knowledge for the bilingual data, respectively, thereby enhancing the language representation. 
Experimental results on WMT14 English-to-German and WMT17 Chinese-to-English translation tasks demonstrate the effectiveness and universality of the proposed method over a strong Transformer-based baseline. ", + "title": "Prior Knowledge Representation for Self-Attention Networks", + "authors": [ + "Kehai Chen", + "Rui Wang", + "Masao Utiyama", + "Eiichiro Sumita" + ], + "emails": [ + "~Masao_Utiyama2", + "~Rui_Wang10", + "~Eiichiro_Sumita1", + "~Kehai_Chen2" + ], + "rank": 2713 + }, + { + "url": "https://openreview.net/forum?id=KVTkzgz3g8O", + "decision": "Reject", + "ratings": [ + 5, + 4, + 3 + ], + "rating": "3.92", + "confidences": [ + 4, + 4, + 5 + ], + "abstract": "We present TraDE, a self-attention-based architecture for auto-regressive density estimation with continuous and discrete valued data. Our model is trained using a penalized maximum likelihood objective, which ensures that samples from the density estimate resemble the training data distribution. The use of self-attention means that the model need not retain conditional sufficient statistics during the auto-regressive process beyond what is needed for each covariate. On standard tabular and image data benchmarks, TraDE produces significantly better density estimates than existing approaches such as normalizing flow estimators and recurrent auto-regressive models. However log-likelihood on held-out data only partially reflects how useful these estimates are in real-world applications. In order to systematically evaluate density estimators, we present a suite of tasks such as regression using generated samples, out-of-distribution detection, and robustness to noise in the training data and demonstrate that TraDE works well in these scenarios. 
", + "title": "TraDE: A Simple Self-Attention-Based Density Estimator", + "authors": [ + "Rasool Fakoor", + "Pratik Anil Chaudhari", + "Jonas Mueller", + "Alex Smola" + ], + "emails": [ + "~Jonas_Mueller1", + "~Pratik_Anil_Chaudhari1", + "~Rasool_Fakoor1", + "~Alex_Smola1" + ], + "rank": 2714 + }, + { + "url": "https://openreview.net/forum?id=WfY0jNndSn3", + "ratings": [ + 4, + 3, + 5, + 4 + ], + "rating": "3.92", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "Commonsense question answering (QA) requires to mine the clues in the context to reason the answer to a question, and is a central task in natural language processing. Despite the advances of current pre-trained models, e.g. BERT, they often learn artifactual causality between the clues in context and the question because of similar but artifactual clues or highly frequent question-clue pairs in training data. To solve this issue, we propose a novel DynamIc Self-sUperviSed Erasure (DISUSE) which adaptively erases redundant and artifactual clues in the context and questions to learn and establish the correct corresponding pair relations between the questions and their clues. Specifically, DISUSE contains an \\textit{erasure sampler} and a \\textit{supervisor}.\nThe erasure sampler estimates the correlation scores between all clues and the question in an attention manner, and then erases each clue (object in image or word in question and context) according to the probability which inversely depends on its correlation score. In this way, the redundant and artifactual clues to the current question are removed, while necessary and important clues are preserved. Then the supervisor evaluates current erasure performance by inspecting whether the erased sample and its corresponding vanilla sample have consistent answer prediction distribution, and supervises the KL divergence between these two answer prediction distributions to progressively improve erasure quality in a self-supervised manner. 
As a result, DISUSE can learn and establish more precise corresponding question-clue pairs, and thus gives more precise answers of new questions in present of their contexts via reasoning the key and correct corresponding clues to the questions. Extensive experiment results on the RC dataset (ReClor) and VQA datasets (GQA and VQA 2.0) demonstrate the superiority of our DISUSE over the state-of-the-arts.", + "title": "Erasure for Advancing: Dynamic Self-Supervised Learning for Commonsense Reasoning", + "authors": [ + "Fuyu Wang", + "Pan Zhou", + "Xiaodan Liang", + "Liang Lin" + ], + "emails": [ + "~Fuyu_Wang1", + "~Liang_Lin1", + "~Xiaodan_Liang2", + "~Pan_Zhou3" + ], + "rank": 2715 + }, + { + "url": "https://openreview.net/forum?id=TWDczblpqE", + "decision": "Reject", + "ratings": [ + 5, + 3, + 4 + ], + "rating": "3.92", + "confidences": [ + 3, + 4, + 5 + ], + "abstract": "Honey bees are critical to our ecosystem and food security as a pollinator, contributing 35% of our global agriculture yield. In spite of their importance, beekeeping is exclusively dependent on human labor and experience-derived heuristics, while requiring frequent human checkups to ensure the colony is healthy, which can disrupt the colony. Increasingly, pollinator populations are declining due to threats from climate change, pests, environmental toxicity, making their management even more critical than ever before in order to ensure sustained global food security. To start addressing this pressing challenge, we developed an integrated hardware sensing system for beehive monitoring through audio and environment measurements, and a hierarchical semi-supervised deep learning model, composed of an audio modeling module and a predictor, to model the strength of beehives. The model is trained jointly on audio reconstruction and prediction losses based on human inspections, in order to model both low-level audio features and circadian temporal dynamics. 
We show that this model performs well despite limited labels, and can learn an audio embedding that is useful for characterizing different sound profiles of beehives. This is the first instance to our knowledge of applying audio-based deep learning to model beehives and population size in an observational setting across a large number of hives.", + "title": "Semi-Supervised Audio Representation Learning for Modeling Beehive Strengths", + "authors": [ + "Tony Zhang", + "Szymon Zmyslony", + "Sergei Nozdrenkov", + "Matthew Smith", + "Brandon Kingsley Hopkins" + ], + "emails": [ + "~Tony_Zhang2", + "google.com", + "wsu.edu" + ], + "rank": 2716 + }, + { + "url": "https://openreview.net/forum?id=kq4SNxgQI4v", + "decision": "Reject", + "ratings": [ + 3, + 5, + 4 + ], + "rating": "3.92", + "confidences": [ + 5, + 4, + 4 + ], + "abstract": "Prior word alignment has been shown indeed helpful for a better translation if such prior is good enough and can be acquired in a convenient way at the same time. Traditionally, word alignment can be learned through statistical machine translation (SMT) models. In this paper, we propose a novel method that infuses prior word alignment information into neural machine translation (NMT) to provide hints or guidelines for the target sentence at running time. To this end, previous works of similar approaches should build dictionaries for specific domains, or constraint the decoding process, or both. While being effective to some extent, these methods may greatly affect decoding speed and hurt translation flexibility and efficiency. Instead, this paper introduces an enhancement learning model, which can learn how to directly replace specific source words with their target counterparts according to prior alignment information. The proposed model is then inserted into a neural MT model and augments MT input with the additional target information from the learning model in an effective and more efficient way. 
Our novel method achieves BLEU improvements (up to 1.1) over a strong baseline model on English-Korean and English-Romanian translation tasks.", + "title": "Efficient Neural Machine Translation with Prior Word Alignment", + "authors": [ + "Jeonghyeok Park", + "hai zhao" + ], + "emails": [ + "~hai_zhao1", + "~Jeonghyeok_Park2" + ], + "rank": 2717 + }, + { + "url": "https://openreview.net/forum?id=14nC8HNd4Ts", + "decision": "Reject", + "ratings": [ + 4, + 5, + 3 + ], + "rating": "3.92", + "confidences": [ + 4, + 4, + 5 + ], + "abstract": "Calcium imaging has become a powerful and popular technique to monitor the activity of large populations of neurons in vivo. However, for ethical considerations and despite recent technical developments, recordings are still constrained to a limited number of trials and animals. This limits the amount of data available from individual experiments and hinders the development of analysis techniques and models for more realistic sizes of neuronal populations. The ability to artificially synthesize realistic neuronal calcium signals could greatly alleviate this problem by scaling up the number of trials. Here, we propose a Generative Adversarial Network (GAN) model to generate realistic calcium signals as seen in neuronal somata with calcium imaging. To this end, we propose CalciumGAN, a model based on the WaveGAN architecture and train it on calcium fluorescent signals with the Wasserstein distance. We test the model on artificial data with known ground-truth and show that the distribution of the generated signals closely resembles the underlying data distribution. Then, we train the model on real calcium traces recorded from the primary visual cortex of behaving mice and confirm that the deconvolved spike trains match the statistics of the recorded data. 
Together, these results demonstrate that our model can successfully generate realistic calcium traces, thereby providing the means to augment existing datasets of neuronal activity for enhanced data exploration and modelling.", + "title": "Synthesising Realistic Calcium Traces of Neuronal Populations Using GAN", + "authors": [ + "Bryan M. Li", + "Theoklitos Amvrosiadis", + "Nathalie Rochefort", + "Arno Onken" + ], + "emails": [ + "ed.ac.uk", + "~Arno_Onken1", + "~Bryan_M._Li1" + ], + "rank": 2718 + }, + { + "url": "https://openreview.net/forum?id=pCTSbMFIeIS", + "ratings": [ + 4, + 3, + 5 + ], + "rating": "3.91", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "What mechanisms causes GAN's entanglement? Although developing disentangled GAN has attracted sufficient attention, it is unclear how entanglement is originated by GAN transformation. We in this research propose a difference-in-difference (DID) counterfactual framework to design experiments for analyzing the entanglement mechanism in on of the Progressive-growing GAN (PG-GAN). Our experiment clarify the mechanisms how pixel normalization causes PG-GAN entanglement during a input-unit-ablation transformation. We discover that pixel normalization causes object entanglement by in-painting the area occupied by ablated objects. We also discover the unit-object relation determines whether and how pixel normalization causes objects entanglement. 
Our DID framework theoretically guarantees that the mechanisms that we discover is solid, explainable and comprehensively.", + "title": "Difference-in-Differences: Bridging Normalization and Disentanglement in PG-GAN", + "authors": [ + "Xiao Liu", + "Jiajie Zhang", + "Siting Li", + "Zuotong Wu", + "Yang Yu" + ], + "emails": [ + "tsinghua.edu.cn", + "~Xiao_Liu15", + "mails.tsinghua.edu.cn" + ], + "rank": 2719 + }, + { + "url": "https://openreview.net/forum?id=a0yodLze7gs", + "decision": "Reject", + "ratings": [ + 3, + 4, + 6, + 5, + 2 + ], + "rating": "3.90", + "confidences": [ + 4, + 4, + 4, + 4, + 5 + ], + "abstract": "Disentanglement is a highly desirable property of representation due to its similarity with human\u2019s understanding and reasoning. This improves interpretability, enables the performance of down-stream tasks, and enables controllable generative models.However, this domain is challenged by the abstract notion and incomplete theories to support unsupervised disentanglement learning. We demonstrate the data itself, such as the orientation of images, plays a crucial role in disentanglement and instead of the factors, and the disentangled representations align the latent variables with the action sequences. We further introduce the concept of disentangling action sequences which facilitates the description of the behaviours of the existing disentangling approaches. An analogy for this process is to discover the commonality between the things and categorizing them. \n\nFurthermore, we analyze the inductive biases on the data and find that the latent information thresholds are correlated with the significance of the actions. For the supervised and unsupervised settings, we respectively introduce two methods to measure the thresholds. We further propose a novel framework, fractional variational autoencoder (FVAE), to disentangle the action sequences with different significance step-by-step. 
Experimental results on dSprites and 3D Chairs show that FVAE improves the stability of disentanglement.", + "title": "Disentangling Action Sequences: Discovering Correlated Samples", + "authors": [ + "Jiantao Wu", + "Chunxiuzi Liu", + "Lin Wang" + ], + "emails": [ + "gmail.com", + "vip.qq.com", + "~Jiantao_Wu1" + ], + "rank": 2720 + }, + { + "url": "https://openreview.net/forum?id=5slGDu_bVc6", + "decision": "Reject", + "ratings": [ + 5, + 3, + 4, + 4 + ], + "rating": "3.89", + "confidences": [ + 3, + 5, + 5, + 5 + ], + "abstract": "Deep neural networks often have huge number of parameters, which posts challenges in deployment in application scenarios with limited memory and computation capacity. Knowledge distillation is one approach to derive compact models from bigger ones. \nHowever, it has been observed that a converged heavy teacher model is strongly constrained for learning a compact student network and could make the optimization subject to poor local optima. In this paper, we propose proKT, a new model-agnostic method by projecting the supervision signals of a teacher model into the student's parameter space. Such projection is implemented by decomposing the training objective into local intermediate targets with approximate mirror descent technique. The proposed method could be less sensitive with the quirks during optimization which could result in a better local optima. Experiments on both image and text datasets show that our proposed proKT consistently achieves the state-of-the-art performance comparing to all existing knowledge distillation methods. 
", + "title": "Learning from deep model via exploring local targets", + "authors": [ + "Wenxian Shi", + "Yuxuan Song", + "Hao Zhou", + "Bohan Li", + "Lei Li" + ], + "emails": [ + "~Hao_Zhou6", + "~Yuxuan_Song2", + "bytedance.com", + "~Wenxian_Shi1", + "~Lei_Li11" + ], + "rank": 2721 + }, + { + "url": "https://openreview.net/forum?id=lXW6Sk1075v", + "decision": "Reject", + "ratings": [ + 3, + 5, + 3, + 5 + ], + "rating": "3.89", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "In this paper, we propose a new type of Actor, named forward-looking Actor or FORK for short, for Actor-Critic algorithms. FORK can be easily integrated into a model-free Actor-Critic algorithm. Our experiments on six Box2D and MuJoCo environments with continuous state and action spaces demonstrate significant performance improvement FORK can bring to the state-of-the-art algorithms. A variation of FORK can further solve BipedalWalkerHardcore in as few as four hours using a single GPU.", + "title": "FORK: A FORward-looKing Actor for Model-Free Reinforcement Learning", + "authors": [ + "Honghao Wei", + "Lei Ying" + ], + "emails": [ + "~Lei_Ying1", + "~Honghao_Wei2" + ], + "rank": 2722 + }, + { + "url": "https://openreview.net/forum?id=klB-8BpR5N", + "ratings": [ + 3, + 7, + 3, + 3 + ], + "rating": "3.89", + "confidences": [ + 5, + 4, + 4, + 5 + ], + "abstract": "BLANK", + "title": "BLANK", + "authors": [ + "Soumya Banerjee", + "Vinay P Namboodiri" + ], + "emails": [ + "~Soumya_Banerjee1", + "~Vinay_P_Namboodiri1" + ], + "rank": 2723 + }, + { + "url": "https://openreview.net/forum?id=hga0T0Qcli5", + "ratings": [ + 5, + 5, + 2, + 4 + ], + "rating": "3.89", + "confidences": [ + 4, + 4, + 5, + 5 + ], + "abstract": "We propose an effective embedding model, named QuatRE, to learn quaternion embeddings for entities and relations in knowledge graphs. QuatRE aims to enhance correlations between head and tail entities given a relation within the Quaternion space with Hamilton product. 
QuatRE achieves this goal by further associating each relation with two relation-aware quaternion vectors which are used to rotate the head and tail entities' quaternion embeddings, respectively. To obtain the triple score, QuatRE rotates the rotated embedding of the head entity using the normalized quaternion embedding of the relation, followed by a quaternion-inner product with the rotated embedding of the tail entity. Experimental results demonstrate that our QuatRE produces state-of-the-art performances on well-known benchmark datasets for knowledge graph completion.", + "title": "QuatRE: Relation-Aware Quaternions for Knowledge Graph Embeddings", + "authors": [ + "Dai Quoc Nguyen", + "Thanh Vu", + "Tu Dinh Nguyen", + "Dinh Phung" + ], + "emails": [ + "~Dai_Quoc_Nguyen1", + "~Tu_Dinh_Nguyen1", + "oracle.com", + "~Dinh_Phung2" + ], + "rank": 2724 + }, + { + "url": "https://openreview.net/forum?id=wMIdpzTmnct", + "decision": "Reject", + "ratings": [ + 5, + 3, + 4 + ], + "rating": "3.89", + "confidences": [ + 3, + 4, + 2 + ], + "abstract": "Designing deep networks robust to adversarial examples remains an open problem. Likewise, recent zeroth order hard-label attacks on image classification tasks have shown comparable performance to their first-order alternatives. It is well known that in this setting, the adversary must search for the nearest decision boundary in a query-efficient manner. State-of-the-art (SotA) attacks rely on the concept of pixel grouping, or super-pixels, to perform efficient boundary search. It was recently shown in the first-order setting, that regular adversarial examples leave the data manifold, and on-manifold examples are generalization errors. In this paper, we argue that query efficiency in the zeroth-order setting is connected to the adversary's traversal through the data manifold. In particular, query-efficient hard-label attacks have the unexpected advantage of finding adversarial examples close to the data manifold. 
We empirically demonstrate that against both natural and robustly trained models, an efficient zeroth-order attack produces samples with a progressively smaller manifold distance measure. Further, when a normal zeroth-order attack is made query-efficient through the use of pixel grouping, it can make up to a two-fold increase in query efficiency, and in some cases, reduce a sample's distance to the manifold by an order of magnitude.", + "title": "Hard-label Manifolds: Unexpected advantages of query efficiency for finding on-manifold adversarial examples", + "authors": [ + "Washington Garcia", + "Pin-Yu Chen", + "Somesh Jha", + "Hamilton Scott Clouse", + "Kevin Butler" + ], + "emails": [ + "~Washington_Garcia1", + "~Pin-Yu_Chen1", + "~Somesh_Jha1", + "~Kevin_Butler1", + "us.af.mil" + ], + "rank": 2725 + }, + { + "url": "https://openreview.net/forum?id=whAxkamuuCU", + "decision": "Reject", + "ratings": [ + 5, + 3, + 4 + ], + "rating": "3.89", + "confidences": [ + 3, + 4, + 2 + ], + "abstract": "Neural networks have been shown to have poor compositionality abilities: while they can produce sophisticated output given sufficient data, they perform patchy generalization and fail to generalize to new symbols (e.g. switching a name in a sentence by a less frequent one or one not seen yet). In this paper, we define a class of models whose outputs are equivariant to entity permutations (an analog being convolution networks whose outputs are invariant through translation) without requiring to specify or detect entities in a pre-processing step. We then show how two question-answering models can be made robust to entity permutation using a novel differentiable hybrid semantic-symbolic representation. The benefits of this approach are demonstrated on a set of synthetic NLP tasks where sample complexity and generalization are significantly improved even allowing models to generalize to words that are never seen in the training set. 
When using only 1K training examples for bAbi, we obtain a test error of 1.8% and fail only one task while the best results reported so far obtained an error of 9.9% and failed 7 tasks.\n", + "title": "Symbol-Shift Equivariant Neural Networks", + "authors": [ + "David Salinas", + "Hady Elsahar" + ], + "emails": [ + "~Hady_Elsahar2", + "~David_Salinas1" + ], + "rank": 2726 + }, + { + "url": "https://openreview.net/forum?id=-S7-RsPv78e", + "decision": "Reject", + "ratings": [ + 5, + 3, + 3, + 5 + ], + "rating": "3.88", + "confidences": [ + 3, + 5, + 4, + 4 + ], + "abstract": "Quantized Neural Networks (QNNs) have achieved an enormous step in improving computational efficiency, making it possible to deploy large models to mobile and miniaturized devices.\nIn order to narrow the performance gap between low-precision and full-precision models, we introduce the natural gradient to train a low-precision model by viewing the parameter space as a Riemannian manifold.\nSpecifically, we propose a novel Optimized Natural Gradient Descent (ONGD) method defined by the Hyperbolic divergence, which provides a perspective to calculate the optimized natural gradient in weak curvature and updates the parameters with an amount of computation comparable to Stochastic Gradient Descent (SGD).\nWe conduct an ablation study and results show that the 4-bit quantized ResNet-32 trained with ONGD has a better result than SGD, i.e. 
2.05\\% higher in Top-1 accuracy on CIFAR100 dataset.\nFurther comparison experiments illustrate that our method achieves state-of-the-art results in CIFAR and ImageNet datasets, where the 8-bit version of MobileNet achieves 0.25\\%/0.13\\% higher in Top-1/Top-5 accuracies than the full-precision version on ImageNet dataset.", + "title": "Optimizing Quantized Neural Networks in a Weak Curvature Manifold", + "authors": [ + "Jun Chen", + "Hanwen Chen", + "Jiangning Zhang", + "Wenzhou Chen", + "Yong Liu", + "Yunliang Jiang" + ], + "emails": [ + "~Jun_Chen9", + "~Yunliang_Jiang2", + "zju.edu.cn", + "~Yong_Liu11", + "~Jiangning_Zhang1" + ], + "rank": 2727 + }, + { + "url": "https://openreview.net/forum?id=Bi2OvVf1KPn", + "decision": "Reject", + "ratings": [ + 4, + 4, + 5, + 3 + ], + "rating": "3.88", + "confidences": [ + 4, + 5, + 3, + 5 + ], + "abstract": "Training deep neural models in the presence of corrupted supervisions is challenging as the corrupted data points may significantly impact the generalization performance. To alleviate this problem, we present an efficient robust algorithm that achieves strong guarantees without any assumption on the type of corruption and provides a unified framework for both classification and regression problems. Different from many existing approaches that quantify the quality of individual data points (e.g., loss values) and filter out data points accordingly, the proposed algorithm focuses on controlling the collective impact of data points on the averaged gradient. Even when a corrupted data point failed to be excluded by the proposed algorithm, the data point will have very limited impacts on the overall loss, as compared with state-of-the-art filtering data points based on loss values. 
Extensive empirical results on multiple benchmark datasets have demonstrated the robustness of the proposed method under different types of corruption.", + "title": "Provable Robust Learning for Deep Neural Networks under Agnostic Corrupted Supervision", + "authors": [ + "Boyang Liu", + "Mengying Sun", + "Ding Wang", + "Pang-Ning Tan", + "Jiayu Zhou" + ], + "emails": [ + "msu.edu", + "~Boyang_Liu1", + "~Mengying_Sun1", + "~Pang-Ning_Tan1", + "~Jiayu_Zhou1" + ], + "rank": 2728 + }, + { + "url": "https://openreview.net/forum?id=VyDYSMx1sFU", + "decision": "Reject", + "ratings": [ + 4, + 2, + 4, + 6 + ], + "rating": "3.88", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "With the development of computation capability in devices, companies are eager to utilize ML/DL methods to improve their service quality. However, with traditional Machine Learning approaches, companies need to build up a powerful data center to collect data and perform centralized model training, which turns out to be expensive and inefficient. Federated Learning has been introduced to solve this challenge. Because of its characteristics such as model-only exchange and parallel training, the technique can not only preserve user data privacy but also accelerate model training speed. In this paper, we introduce an approach to end-to-end on-device Machine Learning by utilizing Federated Learning. We validate our approach with an important industrial use case, the wheel steering angle prediction in the field of autonomous driving. Our results show that Federated Learning can significantly improve the quality of local edge models and reach the same accuracy level as compared to the traditional centralized Machine Learning approach without its negative effects. 
Furthermore, Federated Learning can accelerate model training speed and reduce the communication overhead, which proves that this approach has great strength when deploying ML/DL components to real-world embedded systems.", + "title": "End-to-End on-device Federated Learning: A case study", + "authors": [ + "Hongyi Zhang", + "Jan Bosch", + "Helena Holmstr\u00f6m Olsson" + ], + "emails": [ + "mau.se", + "~Hongyi_Zhang4", + "chalmers.se" + ], + "rank": 2729 + }, + { + "url": "https://openreview.net/forum?id=wwbDXGPO6X4", + "ratings": [ + 5, + 5, + 4, + 2 + ], + "rating": "3.88", + "confidences": [ + 4, + 4, + 3, + 5 + ], + "abstract": "Recent defenses published at venues like NIPS, ICML, ICLR and CVPR are mainly focused on mitigating white-box attacks. These defenses do not properly consider adaptive adversaries. In this paper, we expand the scope of these defenses to include adaptive black-box adversaries. Based on our study of these defenses, we develop three contributions. First we propose a new metric for evaluating adversarial robustness when clean accuracy is impacted. Second, we create an enhanced adaptive black-box attack. Third and most significantly, we develop a novel defense against these adaptive black-box attacks. Our defense is based on a combination of deep neural networks and simple image transformations. While straight forward in implementation, this defense yields a unique security property which we term buffer zones. We argue that our defense based on buffer zones offers significant improvements over state-of-the-art defenses. We verify our claims through extensive experimentation. Our results encompass three adversarial models (10 different black-box attacks) on 11 defenses with two datasets (CIFAR-10 and Fashion-MNIST).", + "title": "Buffer Zone based Defense against Adversarial Examples in Image Classification", + "authors": [ + "Kaleel Mahmood", + "Phuong Ha Nguyen", + "Lam M. 
Nguyen", + "Thanh V Nguyen", + "Marten van Dijk" + ], + "emails": [ + "~Thanh_V_Nguyen1", + "~Phuong_Ha_Nguyen1", + "~Kaleel_Mahmood1", + "~Lam_M._Nguyen1", + "~Marten_van_Dijk1" + ], + "rank": 2730 + }, + { + "url": "https://openreview.net/forum?id=clyAUUnldg", + "decision": "Reject", + "ratings": [ + 4, + 4, + 3, + 5 + ], + "rating": "3.88", + "confidences": [ + 4, + 4, + 5, + 3 + ], + "abstract": "The local gradient points to the direction of the steepest slope in an infinitesimal neighborhood. An optimizer guided by the local gradient is often trapped in local optima when the loss landscape is multi-modal. A directional Gaussian smoothing (DGS) approach was recently proposed in (Zhang et al., 2020) and used to define a truly nonlocal gradient, referred to as the DGS gradient, for high-dimensional black-box optimization. Promising results show that replacing the traditional local gradient with the DGS gradient can significantly improve the performance of gradient-based methods in optimizing highly multi-modal loss functions. However, the optimal performance of the DGS gradient may rely on fine tuning of two important hyper-parameters, i.e., the smoothing radius and the learning rate. In this paper, we present a simple, yet ingenious and efficient adaptive approach for optimization with the DGS gradient, which removes the need of hyper-parameter fine tuning. Since the DGS gradient generally points to a good search direction, we perform a line search along the DGS direction to determine the step size at each iteration. The learned step size in turn will inform us of the scale of function landscape in the surrounding area, based on which we adjust the smoothing radius accordingly for the next iteration. We present experimental results on high-dimensional benchmark functions, an airfoil design problem and a game content generation problem. 
The AdaDGS method has shown superior performance over several the state-of-the-art black-box optimization methods.", + "title": "AdaDGS: An adaptive black-box optimization method with a nonlocal directional Gaussian smoothing gradient", + "authors": [ + "Hoang A Tran", + "Guannan Zhang" + ], + "emails": [ + "~Hoang_A_Tran1", + "~Guannan_Zhang1" + ], + "rank": 2731 + }, + { + "url": "https://openreview.net/forum?id=qUzxZj13RWY", + "ratings": [ + 5, + 4, + 4, + 3 + ], + "rating": "3.87", + "confidences": [ + 3, + 4, + 3, + 5 + ], + "abstract": "The choice of learning rate has been explored in many stochastic optimization frameworks to adaptively tune the step-size of gradients in iterative training of deep neural networks. While adaptive optimizers (e.g. AdaM, AdaGrad, RMSProp, AdaBound) offer fast convergence, they exhibit poor generalization characteristics. To achieve better performance, the manual scheduling of learning rates (e.g. step-decaying, cyclical-learning, warmup) is often used but requires expert domain knowledge. It provides limited insight into the nature of the updating rules and recent studies show that different generalization characteristics are observed with different experimental setups. In this paper, rather than raw statistic measurements from gradients (which many adaptive optimizers use), we explore the useful information carried between gradient updates. We measure the energy norm of the low-rank factorization of convolution weights in a convolution neural network to define two probing metrics; knowledge gain and mapping condition. By means of these metrics, we provide empirical insight into the different generalization characteristics of adaptive optimizers. Further, we propose a new optimizer--AdaS--to adaptively regulate the learning rate by tracking the rate of change in knowledge gain. 
Experimentation in several setups reveals that AdaS exhibits faster convergence and superior generalization over existing adaptive learning methods.", + "title": "AdaS: Adaptive Scheduling of Stochastic Gradients", + "authors": [ + "Mahdi S. Hosseini", + "Konstantinos N Plataniotis" + ], + "emails": [ + "~Mahdi_S._Hosseini1", + "~Konstantinos_N_Plataniotis1" + ], + "rank": 2732 + }, + { + "url": "https://openreview.net/forum?id=85d8bg9RvDT", + "decision": "Reject", + "ratings": [ + 4, + 5, + 3, + 4 + ], + "rating": "3.87", + "confidences": [ + 3, + 3, + 5, + 4 + ], + "abstract": "One of the core problems in large-scale recommendations is to retrieve top relevant candidates accurately and efficiently, preferably in sub-linear time. Previous approaches are mostly based on a two-step procedure: first learn an inner-product model and then use maximum inner product search (MIPS) algorithms to search top candidates, leading to potential loss of retrieval accuracy. In this paper, we present Deep Retrieval (DR), an end-to-end learnable structure model for large-scale recommendations. DR encodes all candidates into a discrete latent space. Those latent codes for the candidates are model parameters and to be learnt together with other neural network parameters to maximize the same objective function. With the model learnt, a beam search over the latent codes is performed to retrieve the top candidates. 
Empirically, we showed that DR, with sub-linear computational complexity, can achieve almost the same accuracy as the brute-force baseline.", + "title": "Deep Retrieval: An End-to-End Structure Model for Large-Scale Recommendations", + "authors": [ + "Weihao Gao", + "Xiangjun Fan", + "Jiankai Sun", + "Kai Jia", + "Wenzhi Xiao", + "Chong Wang", + "Xiaobing Liu" + ], + "emails": [ + "~Weihao_Gao1", + "~Chong_Wang8", + "bytedance.com", + "~Xiaobing_Liu1" + ], + "rank": 2733 + }, + { + "url": "https://openreview.net/forum?id=kB8DkEKSDH", + "decision": "Reject", + "ratings": [ + 5, + 4, + 3, + 4 + ], + "rating": "3.87", + "confidences": [ + 2, + 4, + 4, + 5 + ], + "abstract": "This paper introduces an off-policy reinforcement learning method that uses Hellinger distance between sampling policy (from what samples were collected) and current policy (policy being optimized) as a constraint. \nHellinger distance squared multiplied by two is greater than or equal to total variation distance squared and less than or equal to Kullback-Leibler divergence, therefore a lower bound for expected discounted return for the new policy is improved compared to the lower bound for training with KL. \nAlso, Hellinger distance is less than or equal to 1, so there is a policy-independent lower bound for expected discounted return. \nHDCR is capable of training with Experience Replay, a common setting for distributed RL when collecting trajectories using different policies and learning from this data centralized. \nHDCR shows results comparable to or better than Advantage-weighted Behavior Model and Advantage-Weighted Regression on MuJoCo tasks using tiny offline datasets collected by random agents. On bigger datasets (100k timesteps) obtained by pretrained behavioral policy, HDCR outperforms ABM and AWR methods on 3 out of 4 tasks. 
", + "title": "Hellinger Distance Constrained Regression", + "authors": [ + "Egor Rotinov" + ], + "emails": [ + "~Egor_Rotinov1" + ], + "rank": 2734 + }, + { + "url": "https://openreview.net/forum?id=RVhzamxHBjP", + "ratings": [ + 3, + 4, + 5, + 4 + ], + "rating": "3.87", + "confidences": [ + 5, + 4, + 3, + 3 + ], + "abstract": "In many physical systems, inputs related by intrinsic system symmetries are mapped to the same output. When inverting such systems, i.e., solving the associated inverse problems, there is no unique solution. This causes fundamental difficulties for deploying the emerging end-to-end deep learning approach. Using the phase retrieval problem as an illustrative\nexample, we show that careful symmetry breaking on the training data can help get rid of the\ndifficulties and significantly improve learning performance in real data experiments. We also extract and highlight\nthe underlying mathematical principle of the proposed solution, which is directly applicable to\nother inverse problems. ", + "title": "Inverse Problems, Deep Learning, and Symmetry Breaking", + "authors": [ + "Kshitij Tayal", + "Chieh-Hsin Lai", + "Raunak Manekar", + "Zhong Zhuang", + "Vipin Kumar", + "Ju Sun" + ], + "emails": [ + "~Raunak_Manekar1", + "~Vipin_Kumar1", + "~Zhong_Zhuang1", + "~Ju_Sun2", + "~Chieh-Hsin_Lai1", + "~Kshitij_Tayal1" + ], + "rank": 2735 + }, + { + "url": "https://openreview.net/forum?id=yuXQOhKRjBr", + "decision": "Reject", + "ratings": [ + 3, + 4, + 4, + 4, + 4 + ], + "rating": "3.86", + "confidences": [ + 3, + 5, + 4, + 4, + 5 + ], + "abstract": "Graph neural networks (GNNs) offer us an effective framework for graph representation learning via layer-wise neighborhood aggregation. Their success is attributed to their expressive power at learning representation of nodes and graphs. 
To achieve GNNs with high expressive power, existing methods mainly resort to complex neighborhood aggregation functions, e.g., designing injective aggregation function or using multiple aggregation functions. Consequently, their expressive power is limited by the capability of aggregation function, which is tricky to determine in practice. To combat this problem, we propose a novel framework, namely diverse sampling, to improve the expressive power of GNNs. For a target node, diverse sampling offers it diverse neighborhoods, i.e., rooted sub-graphs, and the representation of target node is finally obtained via aggregating the representation of diverse neighborhoods obtained using any GNN model. High expressive power is guaranteed by the diversity of different neighborhoods. We use classical GNNs (i.e., GCN and GAT) as base models to evaluate the effectiveness of the proposed framework. Experiments are conducted at multi-class node classification task on three benchmark datasets and multi-label node classification task on a dataset collected in this paper. Extensive experiments demonstrate the proposed method consistently improve the performance of base GNN models. The proposed framework is applicable to any GNN models and thus is general for improving the expressive power of GNNs.", + "title": "Towards Powerful Graph Neural Networks: Diversity Matters", + "authors": [ + "Xu Bingbing", + "Huawei Shen", + "Qi Cao", + "Yuanhao Liu", + "Keting Cen", + "Xueqi Cheng" + ], + "emails": [ + "~Xu_Bingbing1", + "~Huawei_Shen1", + "~Xueqi_Cheng1", + "~Qi_Cao1", + "ict.ac.cn" + ], + "rank": 2736 + }, + { + "url": "https://openreview.net/forum?id=2rcgRSAa1A3", + "decision": "Reject", + "ratings": [ + 5, + 4, + 3, + 3 + ], + "rating": "3.86", + "confidences": [ + 4, + 4, + 4, + 2 + ], + "abstract": "Recommender engines play a role in the emergence and reinforcement of filter bubbles. 
When these systems learn that a user prefers content from a particular site, the user will be less likely to be exposed to different sources or opinions and, ultimately, is more likely to develop extremist tendencies.\nWe trace the roots of this phenomenon to the way the recommender engine represents news articles. The vectorial features modern systems extract from the plain text of news articles are already highly predictive of the associated news outlet. We propose a new training scheme based on adversarial machine learning to tackle this issue . Our experiments show that the features we can extract this way are significantly less predictive of the news outlet and thus offer the possibility to reduce the risk of manifestation of new filter bubbles. We validate our intuitions in a news recommendation task using a recent attention-based recommendation system.", + "title": "Fighting Filterbubbles with Adversarial BERT-Training for News-Recommendation", + "authors": [ + "Lukas Pfahler", + "Katharina Morik" + ], + "emails": [ + "~Katharina_Morik1", + "~Lukas_Pfahler1" + ], + "rank": 2737 + }, + { + "url": "https://openreview.net/forum?id=DE0MSwKv32y", + "decision": "Reject", + "ratings": [ + 4, + 6, + 4, + 2 + ], + "rating": "3.86", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "We propose trust-but-verify (TBV) mechanism, a new method which uses model uncertainty estimates to guide exploration. The mechanism augments graph search planning algorithms by the capacity to deal with learned model's imperfections. We identify certain type of frequent model errors, which we dub false loops, and which are particularly dangerous for graph search algorithms in discrete environments. These errors impose falsely pessimistic expectations and thus hinder exploration. We confirm this experimentally and show that TBV can effectively alleviate them. 
TBV combined with MCTS or Best First Search forms an effective model-based reinforcement learning solution, which is able to robustly solve sparse reward problems. ", + "title": "Trust, but verify: model-based exploration in sparse reward environments", + "authors": [ + "Konrad Czechowski", + "Tomasz Odrzyg\u00f3\u017ad\u017a", + "Micha\u0142 Izworski", + "Marek Zbysi\u0144ski", + "\u0141ukasz Kuci\u0144ski", + "Piotr Mi\u0142o\u015b" + ], + "emails": [ + "impan.pl", + "gmail.com", + "~\u0141ukasz_Kuci\u0144ski1", + "~Piotr_Mi\u0142o\u015b1", + "student.uw.edu.pl", + "~Konrad_Czechowski1" + ], + "rank": 2738 + }, + { + "url": "https://openreview.net/forum?id=1qJtBS8QF9", + "decision": "Reject", + "ratings": [ + 5, + 4, + 4, + 3, + 3 + ], + "rating": "3.86", + "confidences": [ + 5, + 4, + 5, + 4, + 4 + ], + "abstract": "Recent years, methods based on neural networks have made great achievements in solving large and complex graph problems. However, high efficiency of these methods depends on large training and validation sets, while the acquisition of ground-truth labels is expensive and time-consuming. In this paper, a graph view-consistent learning network (GVCLN) is specially designed for the semi-supervised learning when the number of the labeled samples is very small. We fully exploit the neighborhood aggregation capability of GVCLN and use dual views to obtain different representations. Although the two views have different viewing angles, their observation objects are the same, so their observation representations need to be consistent. For view-consistent representations between two views, two loss functions are designed besides a supervised loss. The supervised loss uses the known labeled set, while a view-consistent loss is applied to the two views to obtain the consistent representation and a pseudo-label loss is designed by using the common high-confidence predictions. 
GVCLN with these loss functions can obtain the view-consistent representations of the original feature. We also find that preprocessing the node features with specific filter before training is good for subsequent classification tasks. Related experiments have been done on the three citation network datasets of Cora, Citeseer, and PubMed. On several node classification tasks, GVCLN achieves state-of-the-art performance.", + "title": "Graph View-Consistent Learning Network", + "authors": [ + "Zhuolin Liao", + "Kun Zhan" + ], + "emails": [ + "~Kun_Zhan1", + "lzu.edu.cn" + ], + "rank": 2739 + }, + { + "url": "https://openreview.net/forum?id=rUm-WPEAQE", + "ratings": [ + 3, + 4, + 4, + 4, + 4 + ], + "rating": "3.86", + "confidences": [ + 3, + 5, + 4, + 5, + 4 + ], + "abstract": "We consider the problem of distributed optimization over a network, using a stochastic variance reduced gradient (SVRG) algorithm, where executing every iteration of the algorithm requires computation and exchange of gradients among network nodes. These tasks always consume network resources, including communication bandwidth and battery power, which we model as a general cost function. In this paper, we consider an SVRG algorithm with arbitrary sampling (SVRG-AS), where the nodes are sampled according to some distribution. We characterize the convergence of SVRG-AS, in terms of this distribution. We determine the distribution that minimizes the costs associated with running the algorithm, with provable convergence guarantees. We show that our approach can substantially outperform vanilla SVRG and its variants in terms of both convergence rate and total cost of running the algorithm. We then show how our approach can optimize the mini-batch size to address the tradeoff between low communication cost and fast convergence rate. 
Comprehensive theoretical and numerical analyses on real datasets reveal that our algorithm can significantly reduce the cost, especially in large and heterogeneous networks. Our results provide important practical insights for using machine learning over Internet-of-Things.", + "title": "Cost-efficient SVRG with Arbitrary Sampling", + "authors": [ + "Hossein S. Ghadikolaei", + "Thomas Ohlson Timoudas", + "Carlo Fischione" + ], + "emails": [ + "~Carlo_Fischione1", + "~Hossein_S._Ghadikolaei1", + "kth.se" + ], + "rank": 2740 + }, + { + "url": "https://openreview.net/forum?id=4ja9sJJygb", + "ratings": [ + 3, + 5, + 4, + 4, + 3 + ], + "rating": "3.85", + "confidences": [ + 4, + 5, + 3, + 4, + 4 + ], + "abstract": "Pruning and quantization are proven methods for improving the performance and storage efficiency of convolutional neural networks (CNNs). Pruning removes near-zero weights in tensors and masks weak connections between neurons in neighbouring layers. Quantization reduces the precision of weights by replacing them with numerically similar values that require less storage. In this paper we identify another form of redundancy in CNN weight tensors, in the form of repeated patterns of similar values. We observe that pruning and quantization both tend to drastically increase the number of repeated patterns in the weight tensors.\nWe investigate several compression schemes to take advantage of this structure in CNN weight data, including multiple forms of Huffman coding, and other approaches inspired by block sparse matrix formats. 
We evaluate our approach on several well-known CNNs and find that we can achieve compaction ratios of 1.4x to 3.1x in addition to the saving from pruning and quantization.", + "title": "Exploiting Weight Redundancy in CNNs: Beyond Pruning and Quantization", + "authors": [ + "Yuan Wen", + "David Gregg" + ], + "emails": [ + "~Yuan_Wen1", + "cs.tcd.ie" + ], + "rank": 2741 + }, + { + "url": "https://openreview.net/forum?id=HFJWWQP3ado", + "ratings": [ + 4, + 4, + 5, + 2 + ], + "rating": "3.83", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "In this paper, we study the importance of pruning in Deep Networks (DNs) and motivate it based on the current absence of data aware weight initialization. Current DN initializations, focusing primarily at maintaining first order statistics of the feature maps through depth, force practitioners to overparametrize a model in order to reach high performances. This overparametrization can then be pruned a posteriori, leading to a phenomenon known as \"winning tickets\". However, the pruning literature still relies on empirically investigations, lacking a theoretical understanding of (1) how pruning affects the decision boundary, (2) how to interpret pruning, (3) how to design principled pruning techniques, and (4) how to theoretically study pruning. To tackle those questions, we propose to employ recent advances in theoretical analysis of Continuous Piecewise Affine (CPA) DNs. From this viewpoint, we can study the DNs' input space partitioning and detect the early-bird (EB) phenomenon, guide practitioners by identifying when to stop the first training step, provide interpretability into current pruning techniques, and develop a principled pruning criteria towards efficient DN training. 
Finally, we conduct extensive experiments to shown the effectiveness of the proposed spline pruning criteria in terms of both layerwise and global pruning over state-of-the-art pruning methods.\nAll the codes will be released publicly upon acceptance.", + "title": "Max-Affine Spline Insights Into Deep Network Pruning", + "authors": [ + "Randall Balestriero", + "Haoran You", + "Zhihan Lu", + "Yutong Kou", + "Yingyan Lin", + "Richard Baraniuk" + ], + "emails": [ + "~Yingyan_Lin1", + "~Haoran_You1", + "~Randall_Balestriero1", + "~Zhihan_Lu1", + "~Yutong_Kou1", + "~Richard_Baraniuk1" + ], + "rank": 2742 + }, + { + "url": "https://openreview.net/forum?id=Aoq37n5bhpJ", + "decision": "Reject", + "ratings": [ + 6, + 3, + 3, + 3 + ], + "rating": "3.83", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "Federated learning has received attention for its efficiency and privacy benefits,in settings where data is distributed among devices. Although federated learning shows significant promise as a key approach when data cannot be shared or centralized, current incarnations show limited privacy properties and have short-comings when applied to common real-world scenarios. One such scenario is heterogeneous data among devices, where data may come from different generating distributions. In this paper, we propose a federated learning framework using a mixture of experts to balance the specialist nature of a locally trained model with the generalist knowledge of a global model in a federated learning setting. Our results show that the mixture of experts model is better suited as a personalized model for devices when data is heterogeneous, outperforming both global and lo-cal models. Furthermore, our framework gives strict privacy guarantees, which allows clients to select parts of their data that may be excluded from the federation. 
The evaluation shows that the proposed solution is robust to the setting where some users require a strict privacy setting and do not disclose their models to a central server at all, opting out from the federation partially or entirely. The proposed framework is general enough to include any kind of machine learning models, and can even use combinations of different kinds.", + "title": "Federated learning using mixture of experts", + "authors": [ + "Edvin Listo Zec", + "John Martinsson", + "Olof Mogren", + "Leon Ren\u00e9 S\u00fctfeld", + "Daniel Gillblad" + ], + "emails": [ + "~Daniel_Gillblad1", + "~Edvin_Listo_Zec1", + "~John_Martinsson1", + "~Leon_Ren\u00e9_S\u00fctfeld1", + "~Olof_Mogren1" + ], + "rank": 2743 + }, + { + "url": "https://openreview.net/forum?id=7Z29QbHxIL", + "decision": "Reject", + "ratings": [ + 3, + 5, + 4 + ], + "rating": "3.83", + "confidences": [ + 5, + 3, + 4 + ], + "abstract": "Existing one-shot neural architecture search (NAS) methods generally contain a giant supernet, which leads to heavy computational cost. Our method, named FTSO, separates the whole architecture search into two sub-steps. In the first step, we only search for the topology, and in the second step, we only search for the operators. FTSO not only reduces NAS\u2019s search time from days to 0.68 seconds, but also significantly improves the accuracy. Specifically, our experiments on ImageNet show that within merely 18 seconds, FTSO can achieve 76.4% testing accuracy, 1.5% higher than the baseline, PC-DARTS. 
In addition, FTSO can reach 97.77% testing accuracy, 0.27% higher than the baseline, with 99.8% of search time saved on CIFAR10.", + "title": "FTSO: Effective NAS via First Topology Second Operator", + "authors": [ + "Likang Wang", + "Lei Chen" + ], + "emails": [ + "~Likang_Wang1", + "~Lei_Chen7" + ], + "rank": 2744 + }, + { + "url": "https://openreview.net/forum?id=KIfbqntFnOc", + "ratings": [ + 7, + 6, + 5, + 1 + ], + "rating": "3.83", + "confidences": [ + 2, + 2, + 3, + 5 + ], + "abstract": "Residual neural networks (ResNets) can be modeled as dynamical systems where the evolution of dynamical systems represents the inference in ResNets. We exploit this connection and the theory of stochastic dynamical systems to construct a novel ensemble of It\u00f4 processes as a new deep learning representation that is more robust than classical residual networks. An It\u00f4 process obtained by solving a suitably-formulated stochastic differential equation derived from a residual network has a probability density function that is not readily perturbed by small changes in the neural network\u2019s inputs. Our robust stochastic It\u00f4 ensemble of neural networks achieve an accuracy of 73.91% on the CIFAR-10 dataset against the PGD attack with \u03b5 = 2.0 under the L2 norm, while the accuracy of Madry\u2019s robustness toolbox on the same attack is 18.59%. Similarly, our stochastic It\u00f4 ensemble of neural networks achieves an accuracy of 79.66% on PGD attack with \u03b5 = 16/255 under the L\u221e norm, while the accuracy of Madry\u2019s robustness toolbox on the same attack is 18.13%. The It\u00f4 ensemble trained on ImageNet achieves an accuracy of 28.53% against PGD attacks under the L\u221e norm with \u03b5 = 16/255 and accuracy of 65.74% under the L2 norm with \u03b5 = 3.0, respectively. This significantly improves state-of-the-art accuracy of 5% and 35.16% for Madry\u2019s robustness tool against the same PGD attacks under the L\u221e and L2 norms, respectively. 
Further, our approach achieves these high robustness values without any explicit adversarial training or a significant loss of accuracy on benign inputs.", + "title": "Robust Ensembles of Neural Networks using It\u00f4 Processes", + "authors": [ + "Sumit Kumar Jha", + "Susmit Jha", + "Rickard Ewetz", + "Alvaro Velasquez" + ], + "emails": [ + "~Susmit_Jha1", + "~Rickard_Ewetz1", + "gmail.com", + "~Sumit_Kumar_Jha2" + ], + "rank": 2745 + }, + { + "url": "https://openreview.net/forum?id=WweBNiwWkZh", + "decision": "Reject", + "ratings": [ + 3, + 6, + 4, + 4 + ], + "rating": "3.83", + "confidences": [ + 4, + 1, + 3, + 4 + ], + "abstract": "We present a novel learning framework for cloth deformation by embedding virtual cloth into a tetrahedral mesh that parametrizes the volumetric region of air surrounding the underlying body. In order to maintain this volumetric parameterization during character animation, the tetrahedral mesh is constrained to follow the body surface as it deforms. We embed the cloth mesh vertices into this parameterization of three-dimensional space in order to automatically capture much of the nonlinear deformation due to both joint rotations and collisions. We then train a convolutional neural network to recover ground truth deformation by learning cloth embedding offsets for each skeletal pose. Our experiments show significant improvement over learning cloth offsets from body surface parameterizations, both quantitatively and visually, with prior state of the art having a mean error five standard deviations higher than ours. Without retraining, our neural network generalizes to other body shapes and T-shirt sizes, giving the user some indication of how well clothing might fit. 
Our results demonstrate the efficacy of a general learning paradigm where high-frequency details can be embedded into low-frequency parameterizations.", + "title": "Skinning a Parameterization of Three-Dimensional Space for Neural Network Cloth", + "authors": [ + "Jane Wu", + "Zhenglin Geng", + "Hui Zhou", + "Ronald Fedkiw" + ], + "emails": [ + "jd.com", + "stanford.edu", + "~Jane_Wu2", + "~Ronald_Fedkiw1" + ], + "rank": 2746 + }, + { + "url": "https://openreview.net/forum?id=pAj7zLJK05U", + "decision": "Reject", + "ratings": [ + 5, + 5, + 3, + 3 + ], + "rating": "3.82", + "confidences": [ + 4, + 3, + 5, + 5 + ], + "abstract": "Deep Neural Networks (DNNs) have been shown vulnerable to adversarial attacks, which could produce adversarial samples that easily fool the state-of-the-art DNNs. The harmfulness of adversarial attacks calls for the defense mechanisms under fire. However, the relationship between adversarial attacks and defenses is like spear and shield. Whenever a defense method is proposed, a new attack would be followed to bypass the defense immediately. Devising a definitive defense against new attacks~(zero-day attacks) is proven to be challenging. We tackle this challenge by characterizing the intrinsic properties of adversarial samples, via measuring the norm of the perturbation after a counterattack. Our method is based on the idea that, from an optimization perspective, adversarial samples would be closer to the decision boundary; thus the perturbation to counterattack adversarial samples would be significantly smaller than normal cases. Motivated by this, we propose AttackDist, an attack-agnostic property to characterize adversarial samples. We first theoretically clarify under which condition AttackDist can provide a certified detecting performance, then show that a potential application of AttackDist is distinguishing zero-day adversarial examples without knowing the mechanisms of new attacks. 
As a proof-of-concept, we evaluate AttackDist on two widely used benchmarks. The evaluation results show that AttackDist can outperform the state-of-the-art detection measures by large margins in detecting zero-day adversarial attacks.", + "title": "AttackDist: Characterizing Zero-day Adversarial Samples by Counter Attack", + "authors": [ + "Simin Chen", + "Zihe Song", + "Lei Ma", + "Cong Liu", + "Wei Yang" + ], + "emails": [ + "~Simin_Chen1", + "~Lei_Ma1", + "~Cong_Liu2", + "~Zihe_Song1", + "utdallas.edu" + ], + "rank": 2747 + }, + { + "url": "https://openreview.net/forum?id=pbXQtKXwLS", + "decision": "Reject", + "ratings": [ + 3, + 4, + 4, + 4 + ], + "rating": "3.81", + "confidences": [ + 3, + 4, + 4, + 5 + ], + "abstract": "We propose a simple, data-driven approach to help guide hyperparameter selection for neural network initialization. We leverage the relationship between neural network and Gaussian process models having corresponding activation and covariance functions to infer the hyperparameter values desirable for model initialization. Our experiment shows that marginal likelihood maximization provides recommendations that yield near-optimal prediction performance on MNIST classification task under experiment constraints. Furthermore, our empirical results indicate consistency in the proposed technique, suggesting that computation cost for the procedure could be significantly reduced with smaller training sets. ", + "title": "Guiding Neural Network Initialization via Marginal Likelihood Maximization", + "authors": [ + "Anthony Tai", + "Chunfeng Huang" + ], + "emails": [ + "~Anthony_Tai1", + "~Chunfeng_Huang1" + ], + "rank": 2748 + }, + { + "url": "https://openreview.net/forum?id=cP2fJWhYZe0", + "decision": "Reject", + "ratings": [ + 6, + 3, + 2, + 5 + ], + "rating": "3.81", + "confidences": [ + 3, + 5, + 4, + 4 + ], + "abstract": "Image classifiers are typically scored on their test set accuracy, but high accuracy can mask a subtle type of model failure. 
We find that high scoring convolutional neural networks (CNNs) on popular benchmarks exhibit troubling pathologies that allow them to display high accuracy even in the absence of semantically salient features. When a model provides a high-confidence decision without salient supporting input features, we say the classifier has overinterpreted its input, finding too much class-evidence in patterns that appear nonsensical to humans. Here, we demonstrate that neural networks trained on CIFAR-10 and ImageNet suffer from overinterpretation, and we find models on CIFAR-10 make confident predictions even when 95% of input images are masked and humans cannot discern salient features in the remaining pixel-subsets. Although these patterns portend potential model fragility in real-world deployment, they are in fact valid statistical patterns of the benchmark that alone suffice to attain high test accuracy. Unlike adversarial examples, overinterpretation relies upon unmodified image pixels. We find ensembling and input dropout can each help mitigate overinterpretation.", + "title": "Overinterpretation reveals image classification model pathologies", + "authors": [ + "Brandon Carter", + "Siddhartha Jain", + "Jonas Mueller", + "David Gifford" + ], + "emails": [ + "~Siddhartha_Jain1", + "~Jonas_Mueller1", + "~Brandon_Carter1", + "~David_Gifford1" + ], + "rank": 2749 + }, + { + "url": "https://openreview.net/forum?id=dluhjOg0qKn", + "decision": "Reject", + "ratings": [ + 4, + 3, + 3, + 5 + ], + "rating": "3.81", + "confidences": [ + 5, + 4, + 3, + 4 + ], + "abstract": "In the low-data regime, it is difficult to train good supervised models from scratch.\nInstead practitioners turn to pre-trained models, leveraging transfer learning. 
Ensembling is an empirically and theoretically appealing way to construct powerful predictive models, but the predominant approach of training multiple deep networks with different random initialisations collides with the need for transfer via pre-trained weights. In this work, we study different ways of creating ensembles from pre-trained models. We show that the nature of pre-training itself is a performant source of diversity, and propose a practical algorithm that efficiently identifies a subset of pre-trained models for any downstream dataset. The approach is simple: Use nearest-neighbour accuracy to rank pre-trained models, fine-tune the best ones with a small hyperparameter sweep, and greedily construct an ensemble to minimise validation cross-entropy. When evaluated together with strong baselines on 19 different downstream tasks (the Visual Task Adaptation Benchmark), this achieves state-of-the-art performance at a much lower inference budget, even when selecting from over 2,000 pre-trained models. We also assess our ensembles on ImageNet variants and show improved robustness to distribution shift.", + "title": "Deep Ensembles for Low-Data Transfer Learning", + "authors": [ + "Basil Mustafa", + "Carlos Riquelme Ruiz", + "Joan Puigcerver", + "Andr\u00e9 Susano Pinto", + "Daniel Keysers", + "Neil Houlsby" + ], + "emails": [ + "~Basil_Mustafa1", + "~Andr\u00e9_Susano_Pinto1", + "~Joan_Puigcerver1", + "~Neil_Houlsby1", + "~Carlos_Riquelme_Ruiz1", + "~Daniel_Keysers2" + ], + "rank": 2750 + }, + { + "url": "https://openreview.net/forum?id=Ljcb2tylYn1", + "ratings": [ + 5, + 5, + 2, + 4 + ], + "rating": "3.81", + "confidences": [ + 5, + 2, + 5, + 4 + ], + "abstract": "Deep-learning based image classification is applied in this studies to the Luria's alternating series tests to support diagnostics of the Parkinson's disease. 
Luria's alternating series tests belong to the family of fine-motor drawing tests and been used in neurology and psychiatry for nearly a century. Introduction of the digital tables and later tablet PCs has allowed deviating from the classical paper and pen setting, and observe kinematic and pressure parameters describing the test. While such setting has led to a highly accurate machine learning models, the visual component of the tests is left unused. Namely, the shapes of the drawn lines are not used to classify the drawings, which eventually has caused the shift in the assessment paradigm from visual-based to the numeric parameters based. The approach proposed in this paper allows combining two assessment paradigms by augmenting initial drawings by the kinematic and pressure parameters. The paper demonstrates that the resulting network has the accuracy similar to those of human practitioner.", + "title": "CNN Based Analysis of the Luria\u2019s Alternating Series Test for Parkinson\u2019s Disease Diagnostics", + "authors": [ + "Sergei Zarembo", + "Sven Nomm", + "Kadri Medijainen", + "Pille Taba", + "Aaro Toomela" + ], + "emails": [ + "~Sven_Nomm1", + "tlu.ee", + "taltech.ee", + "kliinikum.ee", + "ut.ee" + ], + "rank": 2751 + }, + { + "url": "https://openreview.net/forum?id=16WMyqeYgw", + "ratings": [ + 5, + 5, + 4, + 2 + ], + "rating": "3.81", + "confidences": [ + 3, + 4, + 4, + 5 + ], + "abstract": "This paper introduces a novel method for constructing an upper bound for exploration policy using either the weighted variance of return sequences or the weighted temporal difference (TD) error. We demonstrate that the variance of the return sequence for a specific state-action pair is an important information source that can be leveraged to guide exploration in reinforcement learning. The intuition is that fluctuation in the return sequence indicates greater uncertainty in the near future returns. 
This divergence occurs because of the cyclic nature of value-based reinforcement learning; the evolving value function begets policy improvements which in turn modify the value function. Although both variance and TD errors capture different aspects of this uncertainty, our analysis shows that both can be valuable to guide exploration. We propose a two-stream network architecture to estimate weighted variance/TD errors within DQN agents for our exploration method and show that it outperforms the baseline on a wide range of Atari games.", + "title": "Leveraging the Variance of Return Sequences for Exploration Policy", + "authors": [ + "Zerong Xi", + "Gita Sukthankar" + ], + "emails": [ + "~Gita_Sukthankar1", + "~Zerong_Xi1" + ], + "rank": 2752 + }, + { + "url": "https://openreview.net/forum?id=8uv1YXVi80", + "ratings": [ + 5, + 4, + 5, + 2 + ], + "rating": "3.81", + "confidences": [ + 4, + 4, + 3, + 5 + ], + "abstract": "Deep Learning (DL) models are known to be heavily over-parametrized, resulting in a large memory footprint and power consumption. This hampers the use of such models in hardware-constrained edge technologies such as wearables and mobile devices. Model compression during training can be achieved by promoting sparse network structures both through weight regularization and by leveraging dynamic pruning methods. State-of-the-art pruning methods are however mostly magnitude-based which impedes their use in e.g. binary settings. Importantly, most of the pruning methods do not provide a structural sparsity, resulting in an inefficient memory allocation and access for hardware implementations. In this paper, we propose a novel dynamic pruning solution that we term Dynamic Probabilistic Pruning (DPP). DPP leverages Gumbel top-K sampling to select subsets of weights during training, which enables exploring which weights are most relevant. 
Our approach allows for setting an explicit per-neuron layer-wise sparsity level and structural pruning across weights and feature maps, without relying on weight magnitude heuristics. Relevantly, our method generates a hardware-oriented structural sparsity for fully-connected and convolutional layers that facilitates memory allocation and access, in contrast with conventional unstructured pruning.\nWe show that DPP achieves competitive sparsity levels and classification accuracy on MNIST and CIFAR-10, CIFAR-100 datasets compared to a state-of-the-art baseline for various DL architectures, while respecting per-neuron sparsity constraints.", + "title": "Dynamic Probabilistic Pruning: Training sparse networks based on stochastic and dynamic masking", + "authors": [ + "Lizeth Gonzalez Carabarin", + "Iris A.M. Huijben", + "Bastiaan S. Veeling", + "Alexandre Schmid", + "Ruud Van Sloun" + ], + "emails": [ + "~Iris_A.M._Huijben1", + "~Ruud_Van_Sloun1", + "~Bastiaan_S._Veeling1", + "epfl.ch", + "~Lizeth_Gonzalez_Carabarin1" + ], + "rank": 2753 + }, + { + "url": "https://openreview.net/forum?id=g1KmTQhOhag", + "decision": "Reject", + "ratings": [ + 4, + 3, + 4, + 5, + 3 + ], + "rating": "3.81", + "confidences": [ + 5, + 4, + 4, + 4, + 4 + ], + "abstract": "Transformer-based models have achieved state-of-the-art results in many natural language processing tasks. The self-attention architecture allows transformer to combine information from all elements of a sequence into context-aware representations. However, information about the context is stored mostly in the same element-wise representations. This might limit the processing of properties related to the sequence as a whole more difficult. Adding trainable memory to selectively store local as well as global representations of a sequence is a promising direction to improve the Transformer model. Memory-augmented neural networks (MANNs) extend traditional neural architectures with general-purpose memory for representations. 
MANNs have demonstrated the capability to learn simple algorithms like Copy or Reverse and can be successfully trained via backpropagation on diverse tasks from question answering to language modeling outperforming RNNs and LSTMs of comparable complexity. In this work, we propose and study few extensions of the Transformer baseline (1) by adding memory tokens to store non-local representations, (2) creating memory bottleneck for the global information, (3) controlling memory update with dedicated layer. We evaluate these memory augmented Transformers and demonstrate that presence of memory positively correlates with the model performance for machine translation and language modelling tasks. Augmentation of pre-trained masked language model with memory tokens shows mixed results for tasks from GLUE benchmark. Visualization of attention patterns over the memory suggest that it improves the model's ability to process a global context.", + "title": "Memory Representation in Transformer", + "authors": [ + "Mikhail Burtsev", + "Yurii Kuratov", + "Anton Peganov", + "Grigory V. Sapunov" + ], + "emails": [ + "~Grigory_V._Sapunov1", + "~Mikhail_Burtsev1", + "phystech.edu", + "nvidia.com" + ], + "rank": 2754 + }, + { + "url": "https://openreview.net/forum?id=xoPj3G-OKNM", + "decision": "Reject", + "ratings": [ + 3, + 4, + 4, + 4 + ], + "rating": "3.81", + "confidences": [ + 3, + 5, + 4, + 4 + ], + "abstract": "Stochastic gradient descent (SGD) and its variants have been the dominating optimization methods in machine learning. Compared with small batch training, SGD with large batch training can better utilize the computational power of current multi-core systems like GPUs and can reduce the number of communication rounds in distributed training. Hence, SGD with large batch training has attracted more and more attention. However, existing empirical results show that large batch training typically leads to a drop of generalization accuracy. 
As a result, large batch training has also become a challenging topic. In this paper, we propose a novel method, called stochastic normalized gradient descent with momentum (SNGM), for large batch training. We theoretically prove that compared to momentum SGD (MSGD) which is one of the most widely used variants of SGD, SNGM can adopt a larger batch size to converge to the \u03f5-stationary point with the same computation complexity (total number of gradient computation). Empirical results on deep learning also show that SNGM can achieve the state-of-the-art accuracy with a large batch size.", + "title": "Stochastic Normalized Gradient Descent with Momentum for Large Batch Training", + "authors": [ + "Shen-Yi Zhao", + "Yin-Peng Xie", + "Wu-Jun Li" + ], + "emails": [ + "~Wu-Jun_Li1", + "~Shen-Yi_Zhao2", + "~Yin-Peng_Xie1" + ], + "rank": 2755 + }, + { + "url": "https://openreview.net/forum?id=FGvJvxn2wWO", + "ratings": [ + 4, + 5, + 3, + 3, + 4 + ], + "rating": "3.80", + "confidences": [ + 5, + 5, + 5, + 5, + 5 + ], + "abstract": "We present a novel domain adaptation framework that uses morphologic segmentation to translate images from arbitrary input domains (real and synthetic) into a uniform output domain. Our framework is based on an established image-to-image translation pipeline that allows us to first transform the input image into a generalized representation that encodes morphology and semantics \u2013 the edge-plus-segmentation map (EPS) \u2013 which is then transformed into an output domain. Images transformed into the output domain are photo-realistic and free of artifacts that are commonly present across different real (e.g. lens flare, motion blur, etc.) and synthetic (e.g. unrealistic textures, simplified geometry, etc.) data sets. Our goal is to establish a preprocessing step that unifies data from multiple sources into a common representation that facilitates training downstream tasks in computer vision. 
This way, neural networks for existing tasks can be trained on a larger variety of training data, while they are also less affected by overfitting to specific data sets. We showcase the effectiveness of our approach by qualitatively and quantitatively evaluating our method on four data sets of simulated and real data of urban scenes.", + "title": "Domain Adaptation with Morphologic Segmentation", + "authors": [ + "Jonathan Klein", + "Soren Pirk", + "Dominik Michels" + ], + "emails": [ + "~Dominik_Michels1", + "~Soren_Pirk2", + "~Jonathan_Klein1" + ], + "rank": 2756 + }, + { + "url": "https://openreview.net/forum?id=GJkY3ptA3vJ", + "ratings": [ + 4, + 3, + 5, + 3 + ], + "rating": "3.80", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Although the self-supervised pre-training of transformer models has resulted in the revolutionizing of natural language processing (NLP) applications and the achievement of state-of-the-art results with regard to various benchmarks, this process is still vulnerable to small and imperceptible permutations originating from legitimate inputs. Intuitively, the representations should be similar in the feature space with subtle input permutations, while large variations occur with different meanings. This motivates us to investigate the learning of robust textual representation in a contrastive manner. However, it is non-trivial to obtain opposing semantic instances for textual samples. In this study, we propose a disentangled contrastive learning method that separately optimizes the uniformity and alignment of representations without negative sampling. Specifically, we introduce the concept of momentum representation consistency to align features and leverage power normalization while conforming the uniformity. 
Our experimental results for the NLP benchmarks demonstrate that our approach can obtain better results compared with the baselines, as well as achieve promising improvements with invariance tests and adversarial attacks.", + "title": "Towards Robust Textual Representations with Disentangled Contrastive Learning", + "authors": [ + "Ningyu Zhang", + "Xiang Chen", + "Xin Xie", + "Shumin Deng", + "Yantao Jia", + "Zonggang Yuan", + "Huajun Chen" + ], + "emails": [ + "~Xin_Xie2", + "~Zonggang_Yuan1", + "~Huajun_Chen1", + "~Shumin_Deng1", + "~Xiang_Chen5", + "~Yantao_Jia1", + "~Ningyu_Zhang1" + ], + "rank": 2757 + }, + { + "url": "https://openreview.net/forum?id=xng0HoPDaFN", + "decision": "Reject", + "ratings": [ + 3, + 5, + 3 + ], + "rating": "3.80", + "confidences": [ + 2, + 4, + 4 + ], + "abstract": "Recently, to deal with the vulnerability to generate examples of CNNs, there are many advanced algorithms that have been proposed. These algorithms focus on modifying global pixels directly with small perturbations, and some work involves modifying local pixels. However, the global attacks have the problem of perturbations\u2019 redundancy and the local attacks are not effective. To overcome this challenge, we achieve a trade-off between the perturbation power and the number of perturbed pixels in this paper. The key idea is to find the feature contributive regions (FCRs) of the images. Furthermore, in order to create an adversarial example similar to the corresponding clean image as much as possible, we redefine a loss function as the objective function of the optimization in this paper and then using gradient descent optimization algorithm to find the efficient perturbations. 
Various experiments have been carried out on CIFAR-10 and ILSVRC2012 datasets, which show the excellence of this method, and in addition, the FCRs attack shows strong attack ability in both white-box and black-box settings.", + "title": "An Adversarial Attack via Feature Contributive Regions", + "authors": [ + "Yaguan Qian", + "Jiamin Wang", + "Xiang Ling", + "Zhaoquan Gu", + "Bin Wang", + "Chunming Wu" + ], + "emails": [ + "zust.edu.cn", + "~Zhaoquan_Gu2", + "~Jiamin_Wang1", + "zju.edu.cn", + "gmail.com", + "~Xiang_Ling1" + ], + "rank": 2758 + }, + { + "url": "https://openreview.net/forum?id=x1uGDeV6ter", + "decision": "Reject", + "ratings": [ + 4, + 4, + 3, + 4 + ], + "rating": "3.80", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "In an autonomous driving scenario, it is vital to acquire and efficiently process data from various sensors to obtain a complete and robust perspective of the surroundings. Many studies have shown the importance of having radar data in addition to images since radar improves object detection performance. We develop a novel algorithm motivated by the hypothesis that with a limited sampling budget, allocating more sampling budget to areas with the object as opposed to a uniform sampling budget ultimately improves relevant object detection and classification. In order to identify the areas with objects, we develop an algorithm to process the object detection results from the Faster R-CNN object detection algorithm and the previous radar frame and use these as prior information to adaptively allocate more bits to areas in the scene that may contain relevant objects. We use previous radar frame information to mitigate the potential information loss of an object missed by the image or the object detection network. Also, in our algorithm, the error of missing relevant information in the current frame due to the limited budget sampling of the previous radar frame did not propagate across frames. 
We also develop an end-to-end transformer-based 2D object detection network using the NuScenes radar and image data. Finally, we compare the performance of our algorithm against that of standard CS and adaptive CS using radar on the Oxford Radar RobotCar dataset.", + "title": "Adaptive Automotive Radar data Acquisition", + "authors": [ + "Madhumitha Sakthi", + "Ahmed Tewfik" + ], + "emails": [ + "~Madhumitha_Sakthi1", + "~Ahmed_Tewfik1" + ], + "rank": 2759 + }, + { + "url": "https://openreview.net/forum?id=30SS5VjvhrZ", + "decision": "Reject", + "ratings": [ + 4, + 3, + 4, + 4 + ], + "rating": "3.80", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Uncertainty evaluation is a core technique when deep neural networks (DNNs) are used in real-world problems. In practical applications, we often encounter unexpected samples that have not seen in the training process. Not only achieving the high-prediction accuracy but also detecting uncertain data is significant for safety-critical systems. In statistics and machine learning, Bayesian inference has been exploited for uncertainty evaluation. The Bayesian neural networks (BNNs) have recently attracted considerable attention in this context, as the DNN trained using dropout is interpreted as a Bayesian method. Based on this interpretation, several methods to calculate the Bayes predictive distribution for DNNs have been developed. Though the Monte-Carlo method called MC dropout is a popular method for uncertainty evaluation, it requires a number of repeated feed-forward calculations of DNNs with randomly sampled weight parameters. To overcome the computational issue, we propose a sampling-free method to evaluate uncertainty. Our method converts a neural network trained using the dropout to the corresponding Bayesian neural network with variance propagation. Our method is available not only to feed-forward NNs but also to recurrent NNs including LSTM. 
We report the computational efficiency and statistical reliability of our method in numerical experiments of the language modeling using RNNs, and the out-of-distribution detection with DNNs. ", + "title": "Bayesian Neural Networks with Variance Propagation for Uncertainty Evaluation", + "authors": [ + "Yuki Mae", + "Wataru Kumagai", + "Takafumi Kanamori" + ], + "emails": [ + "~Wataru_Kumagai2", + "~Yuki_Mae1", + "~Takafumi_Kanamori1" + ], + "rank": 2760 + }, + { + "url": "https://openreview.net/forum?id=udqDDCjdjs5", + "ratings": [ + 4, + 2, + 5, + 4 + ], + "rating": "3.79", + "confidences": [ + 2, + 4, + 5, + 3 + ], + "abstract": "This paper provides a unified analytic framework integrating expressions of several variants of convolutional neural networks and wavelet filter banks. The expressions are derived recursively, from dowstream to upstream layers, for the sequences of features returned at the nodes of a general form of network architecture. The inspiring framework for the derivation of these expressions is that of the so-called M-band convolution filter banks. In addition with the inter-layer inter-node expressions, activation sequences of convolutional neural networks also are mathematically described by suitable algebraic path representations. ", + "title": "Unified analytic forms for Convolutional Neural Networks and Wavelet Filter Banks", + "authors": [ + "Abdourrahmane M ATTO" + ], + "emails": [ + "~Abdourrahmane_M_ATTO1" + ], + "rank": 2761 + }, + { + "url": "https://openreview.net/forum?id=LJRsOvDJ4gP", + "ratings": [ + 4, + 4, + 4, + 3 + ], + "rating": "3.79", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "We propose a novel technique for sampling representatives from a large, unsupervised dataset. The approach is based on the concept of {\\em self-rank}, defined as the minimum number of samples needed to reconstruct all samples with an accuracy proportional to the rank-K approximation. 
As the exact computation of self-rank requires a computationally expensive combinatorial search, we propose an efficient algorithm that jointly estimates self-rank and selects the optimal samples with high accuracy. A theoretical upper bound is derived that reaches the tightest bound for two asymptotic cases. The best approximation ratio for self-representative low-rank approximation was presented in ICML 2017~\\cite{Chierichetti-icml-2017}, which was further improved by the bound 1+K reported in~NeurIPS 2019~\\cite{dan2019optimal}. Both of these bounds depend solely on the number of selected samples. In this paper, for the first time, we present an adaptive approximation ratio depending on spectral properties of the original dataset, A\u2208RN\u00d7M. In particular, our performance bound is proportional to the condition number \u03ba(A). Our derived approximation ratio is expressed as 1+(\u03ba(A)2\u22121)/(N\u2212K) which approaches 1 in two asymptotic cases.\nIn addition to evaluating the proposed algorithm on a synthetic dataset, we show that the proposed sampling scheme can be utilized in real-world applications such as graph node sampling for optimizing the shortest path criterion, and learning a classifier with sampled data.", + "title": "Asymptotic Optimality of Self-Representative Low-Rank Approximation and Its Applications", + "authors": [ + "Saeed Vahidian", + "mohsen Joneidi", + "Ashkan Esmaeili", + "Siavash Khodadadeh", + "Sharare zehtabian", + "Ladislau Boloni", + "Nazanin Rahnavard", + "Bill Lin", + "Mubarak Shah" + ], + "emails": [ + "~Mubarak_Shah3", + "~Bill_Lin1", + "~Siavash_Khodadadeh1", + "knights.ucf.edu", + "~Saeed_Vahidian1", + "~Ladislau_Boloni1", + "~Nazanin_Rahnavard1", + "ucf.edu" + ], + "rank": 2762 + }, + { + "url": "https://openreview.net/forum?id=bd66LuDPPFh", + "decision": "Reject", + "ratings": [ + 4, + 6, + 1, + 6 + ], + "rating": "3.79", + "confidences": [ + 3, + 3, + 5, + 3 + ], + "abstract": "Label smoothing regularization 
(LSR) has a great success in training deep neural networks by stochastic algorithms such as stochastic gradient descent and its variants. However, the theoretical understanding of its power from the view of optimization is still rare. This study opens the door to a deep understanding of LSR by initiating the analysis. In this paper, we analyze the convergence behaviors of stochastic gradient descent with label smoothing regularization for solving non-convex problems and show that an appropriate LSR can help to speed up the convergence by reducing the variance. More interestingly, we proposed a simple yet effective strategy, namely Two-Stage LAbel smoothing algorithm (TSLA), that uses LSR in the early training epochs and drops it off in the later training epochs. We observe from the improved convergence result of TSLA that it benefits from LSR in the first stage and essentially converges faster in the second stage. To the best of our knowledge, this is the first work for understanding the power of LSR via establishing convergence complexity of stochastic methods with LSR in non-convex optimization. We empirically demonstrate the effectiveness of the proposed method in comparison with baselines on training ResNet models over benchmark data sets.", + "title": "Towards Understanding Label Smoothing", + "authors": [ + "Yi Xu", + "Yuanhong Xu", + "Qi Qian", + "Li Hao", + "Rong Jin" + ], + "emails": [ + "~Rong_Jin1", + "~Yuanhong_Xu1", + "~Yi_Xu8", + "~Qi_Qian1", + "~Li_Hao1" + ], + "rank": 2763 + }, + { + "url": "https://openreview.net/forum?id=jcN7a3yZeQc", + "decision": "Reject", + "ratings": [ + 5, + 3, + 3, + 4 + ], + "rating": "3.79", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Q-learning with value function approximation may have the poor performance because of overestimation bias and imprecise estimate. Specifically, overestimation bias is from the maximum operator over noise estimate, which is exaggerated using the estimate of a subsequent state. 
Inspired by the recent advance of deep reinforcement learning and Double Q-learning, we introduce the decorrelated double Q-learning (D2Q). Specifically, we introduce the decorrelated regularization item to reduce the correlation between value function approximators, which can lead to less biased estimation and low variance. The experimental results on a suite of MuJoCo continuous control tasks demonstrate that our decorrelated double Q-learning can effectively improve the performance.", + "title": "Decorrelated Double Q-learning", + "authors": [ + "GANG CHEN" + ], + "emails": [ + "~GANG_CHEN1" + ], + "rank": 2764 + }, + { + "url": "https://openreview.net/forum?id=oLltLS5F9R", + "decision": "Reject", + "ratings": [ + 4, + 4, + 3, + 4 + ], + "rating": "3.78", + "confidences": [ + 5, + 5, + 4, + 4 + ], + "abstract": "Graph Neural Networks (GNNs) have emerged as a useful paradigm to process graph-structured data. Usually, GNNs are stacked to multiple layers and the node representations in each layer are computed through propagating and aggre- gating the neighboring node features with respect to the graph. To effectively train a GNN with multiple layers, some normalization techniques are necessary. Though the existing normalization techniques have been shown to accelerate train- ing GNNs, the structure information on the graph is ignored yet. In this paper, we propose two graph-aware normalization methods to effectively train GNNs. Then, by taking into account that normalization methods for GNNs are highly task-relevant and it is hard to know in advance which normalization method is the best, we propose to learn attentive graph normalization by optimizing a weighted combination of multiple graph normalization methods at different scales. By op- timizing the combination weights, we can automatically select the best or the best combination of multiple normalization methods for a specific task. 
We con- duct extensive experiments on benchmark datasets for different tasks and confirm that the graph-aware normalization methods lead to promising results and that the learned weights suggest the more appropriate normalization methods for specific task", + "title": "Learning Graph Normalization for Graph Neural Networks", + "authors": [ + "Yihao Chen", + "Xin Tang", + "Xianbiao Qi", + "Chun-Guang Li", + "Rong Xiao" + ], + "emails": [ + "~Rong_Xiao3", + "~Xianbiao_Qi2", + "~Yihao_Chen1", + "gmail.com", + "~Chun-Guang_Li3" + ], + "rank": 2765 + }, + { + "url": "https://openreview.net/forum?id=BL4FZG2bCR7", + "ratings": [ + 4, + 4, + 4, + 3 + ], + "rating": "3.78", + "confidences": [ + 4, + 5, + 5, + 4 + ], + "abstract": "A single gene can encode for different protein versions through a process called alternative splicing. Since proteins play major roles in cellular functions, aberrant splicing profiles can result in a variety of diseases, including cancers. Alternative splicing is determined by the gene's primary sequence and other regulatory factors such as RNA-binding protein levels. With these as input, we formulate the prediction of RNA splicing as a regression task and build a new training dataset (CAPD) to benchmark learned models. We propose discrete compositional energy network (DCEN) which leverages the compositionality of key components to approach this task. In the case of alternative splicing prediction, DCEN models mRNA transcript probabilities through its constituent splice junctions' energy values. These transcript probabilities are subsequently mapped to relative abundance values of key nucleotides and trained with ground-truth experimental measurements. 
Through our experiments on CAPD, we show that DCEN outperforms baselines and its ablation variants.", + "title": "RNA Alternative Splicing Prediction with Discrete Compositional Energy Network", + "authors": [ + "Alvin Chan", + "Anna Korsakova", + "Yew-Soon Ong", + "Fernaldo Richtia Winnerdy", + "Kah Wai Lim", + "Anh Tuan Phan" + ], + "emails": [ + "e.ntu.edu.sg", + "~Alvin_Chan1", + "ntu.edu.sg", + "~Yew-Soon_Ong1" + ], + "rank": 2766 + }, + { + "url": "https://openreview.net/forum?id=_QB8Am3Fxkb", + "ratings": [ + 4, + 4, + 4, + 3 + ], + "rating": "3.78", + "confidences": [ + 4, + 5, + 5, + 4 + ], + "abstract": "Batch normalization (BatchNorm) is an effective yet poorly understood technique for neural network optimization. It is often assumed that the degradation in BatchNorm performance to smaller batch sizes stems from it having to estimate layer statistics using smaller sample sizes. However, recently, Ghost normalization (GhostNorm), a variant of BatchNorm that explicitly uses smaller sample sizes for normalization, has been shown to improve upon BatchNorm in some datasets. 
Our contributions are: (i) three types of GhostNorm implementations are described, two of which employ BatchNorm as the underlying normalization technique, (ii) we uncover a source of regularization that is unique to GhostNorm, and not simply an extension from BatchNorm, and visualise the difference in their loss landscapes, (iii) we extend GhostNorm and introduce a new type of normalization layer called Sequential Normalization (SeqNorm), (iv) we compare both GhostNorm and SeqNorm against BatchNorm alone as well as with other regularisation techniques, (v) for both GhostNorm and SeqNorm, we report superior performance over state-of-the-art methodologies on CIFAR--10, CIFAR--100, and ImageNet data sets.", + "title": "Sequential Normalization: an improvement over Ghost Normalization", + "authors": [ + "Neofytos Dimitriou", + "Ognjen Arandjelovic" + ], + "emails": [ + "~Ognjen_Arandjelovic2", + "~Neofytos_Dimitriou1" + ], + "rank": 2767 + }, + { + "url": "https://openreview.net/forum?id=HclUGWwAVE", + "ratings": [ + 6, + 3, + 3, + 4 + ], + "rating": "3.78", + "confidences": [ + 3, + 5, + 5, + 5 + ], + "abstract": "Deep neural networks have achieved striking performance when evaluated on testing data which share the same distribution with training ones, but can significantly fail otherwise. Therefore, eliminating the impact of distribution shifts between training and testing data is of paramount importance for building performance-promising deep models. Conventional methods (e.g. domain adaptation/generalization) assume either the availability of testing data or the known heterogeneity of training data (e.g. domain labels). In this paper, we consider a more challenging case where neither of the above information is available during the training phase. 
We propose to address this problem by removing the dependencies between features via reweighting training samples, which results in a more balanced distribution and helps deep models get rid of spurious correlations and, in turn, concentrate more on the true connection between features and labels. We conduct extensive experiments on object recognition benchmarks including PACS, VLCS, MNIST-M, and NICO which support the evaluation of generalization ability. The experimental results clearly demonstrate the effectiveness of the proposed method compared with state-of-the-art counterparts.", + "title": "Sample Balancing for Improving Generalization under Distribution Shifts", + "authors": [ + "Xingxuan Zhang", + "Peng Cui", + "Renzhe Xu", + "Yue He", + "Linjun Zhou", + "Zheyan Shen" + ], + "emails": [ + "~Yue_He2", + "~Zheyan_Shen1", + "~Linjun_Zhou1", + "~Peng_Cui1", + "~Renzhe_Xu1", + "~Xingxuan_Zhang1" + ], + "rank": 2768 + }, + { + "url": "https://openreview.net/forum?id=5fJ0qcwBNr0", + "decision": "Reject", + "ratings": [ + 4, + 4, + 3, + 4 + ], + "rating": "3.78", + "confidences": [ + 5, + 5, + 4, + 4 + ], + "abstract": "It is widely accepted that vanishing and exploding gradient values are the main reason behind the difficulty of deep network training.\nIn this work, we take a further step to understand the optimization of deep networks and find that both gradient correlations and gradient values have strong impacts on model training. \nInspired by our new finding, we explore a simple yet effective network architecture search (NAS) approach that leverages gradient correlation and gradient values to find well-performing architectures. To be specific, we first formulate these two terms into a unified gradient-based kernel and then select architectures with the largest kernels at initialization as the final networks. 
\nThe new approach replaces the expensive ``train-then-test'' evaluation paradigm with a new lightweight function according to the gradient-based kernel at initialization.\nExperiments show that our approach achieves competitive results with orders of magnitude faster than ``train-then-test'' paradigms on image classification tasks. Furthermore, the extremely low search cost enables its wide applications. It also obtains performance improvements on two text classification tasks.", + "title": "A Gradient-based Kernel Approach for Efficient Network Architecture Search", + "authors": [ + "Jingjing Xu", + "Liang Zhao", + "Junyang Lin", + "Xu Sun", + "Hongxia Yang" + ], + "emails": [ + "~Xu_Sun1", + "~Hongxia_Yang2", + "alibaba-inc.com", + "~Jingjing_Xu1", + "~Liang_Zhao8" + ], + "rank": 2769 + }, + { + "url": "https://openreview.net/forum?id=u8APpiJX3u", + "decision": "Reject", + "ratings": [ + 4, + 3, + 4, + 4 + ], + "rating": "3.77", + "confidences": [ + 4, + 3, + 2, + 4 + ], + "abstract": "Deep neural networks have usually to be compressed and accelerated for their usage in low-power, e.g. mobile, devices. Common requirements are high accuracy, high throughput, low latency, and a small memory footprint. A good trade-off between accuracy and latency has been shown by networks comprising multiple intermediate outputs. In this study, we introduce a multi-output network that has a tiny memory footprint in terms of its computational graph, which allows its execution on novel, massively-parallel hardware accelerators designed for extremely high throughput. To this end, the graph is designed to contain loops by iteratively executing a single network building block. These so-called iterative neural networks enable stateof-the-art results for semantic segmentation on the CamVid and Cityscapes datasets that are especially demanding in terms of computational resources. 
In ablation studies, the improvement of network training by intermediate network outputs as well as the trade-off between weight sharing over iterations and the network size are investigated.", + "title": "ItNet: iterative neural networks for fast and efficient anytime prediction", + "authors": [ + "Thomas Pfeil" + ], + "emails": [ + "~Thomas_Pfeil1" + ], + "rank": 2770 + }, + { + "url": "https://openreview.net/forum?id=rAq9n49mUGl", + "ratings": [ + 4, + 4, + 3, + 4 + ], + "rating": "3.77", + "confidences": [ + 3, + 3, + 3, + 4 + ], + "abstract": "Bayesian parameter estimation methods are robust techniques for quantifying properties of physical systems which cannot be observed directly. In estimating such parameters, one first requires a physics model of the phenomenon to be studied. Often, such a model follows a series of assumptions to make parameter inference feasible. When simplified models are used for inference, however, systematic differences between model predictions and observed data may propagate throughout the parameter estimation process, biasing inference results. In this work, we use generative adversarial networks (GANs) based on the maximum mean discrepancy (MMD) to learn small stochastic corrections to physics models in order to minimize inference bias. We further propose a hybrid training procedure utilizing both the MMD and the standard GAN objective functionals. We demonstrate the ability to learn stochastic model corrections and eliminate inference bias on a toy problem wherein the true data distribution is known. Subsequently, we apply these methods to a mildly ill-posed inference problem in magnetic resonance imaging (MRI), showing improvement over an established inference method. 
Finally, because 3D MRI images often contain millions of voxels which would each require parameter inference, we train a conditional variational autoencoder (CVAE) network on the corrected MRI physics model to perform fast inference and make this approach practical.\n", + "title": "Using MMD GANs to correct physics models and improve Bayesian parameter estimation", + "authors": [ + "Jonathan Doucette", + "Christian Kames", + "Alexander Rauscher" + ], + "emails": [ + "~Alexander_Rauscher1", + "~Jonathan_Doucette1", + "~Christian_Kames1" + ], + "rank": 2771 + }, + { + "url": "https://openreview.net/forum?id=dwI2LBdpszh", + "ratings": [ + 4, + 3, + 4, + 4 + ], + "rating": "3.76", + "confidences": [ + 3, + 4, + 5, + 5 + ], + "abstract": "This paper investigates the design of time and memory efficient graph convolutional networks (GCNs). State-of-the-art GCNs adopt K-nearest neighbor (KNN) searches for local feature aggregation and feature extraction operations from layer to layer. Based on the mathematical analysis of existing graph convolution operations, we articulate the following two card shuffling hypotheses. (1) Shuffling the nearest neighbor selection for KNN searches in a multi-layered GCN approximately preserves the local geometric structures of 3D representations. (2) Shuffling the order of local feature aggregation and feature extraction leads to equivalent or similar composite operations for GCNs. The two hypotheses shed light on two possible directions of accelerating modern GCNs. That is, reasonable shuffling of the cards (neighbor selection or local feature operations) can significantly improve time and memory efficiency. 
A series of experiments show that the network architectures designed based on the proposed card shuffling hypotheses decrease both the time and memory consumption significantly (e.g., about 50% for point cloud classification and semantic segmentation), while maintaining comparable accuracy, on several important tasks in 3D deep learning, i.e., 3D classification, part segmentation, semantic segmentation, and mesh generation.", + "title": "The Card Shuffling Hypotheses: Building a Time and Memory Efficient Graph Convolutional Network", + "authors": [ + "Yawei Li", + "He Chen", + "Zhaopeng Cui", + "Radu Timofte", + "Marc Pollefeys", + "Gregory Chirikjian", + "Luc Van Gool" + ], + "emails": [ + "~Luc_Van_Gool1", + "~Yawei_Li1", + "~Radu_Timofte1", + "~Gregory_Chirikjian1", + "~Marc_Pollefeys2", + "~Zhaopeng_Cui1", + "~He_Chen1" + ], + "rank": 2772 + }, + { + "url": "https://openreview.net/forum?id=CJmMqnXthgX", + "decision": "Reject", + "ratings": [ + 4, + 3, + 4, + 4 + ], + "rating": "3.76", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "Graph neural networks and graph kernels have achieved great success in solving machine learning problems on graphs. Recently, there has been considerable interest in determining the expressive power mainly of graph neural networks and of graph kernels, to a lesser extent. Most studies have focused on the ability of these approaches to distinguish non-isomorphic graphs or to identify specific graph properties. However, there is often a need for algorithms whose produced graph representations can accurately capture similarity/distance of graphs. This paper studies the expressive power of graph neural networks and graph kernels from an empirical perspective. Specifically, we compare the graph representations and similarities produced by these algorithms against those generated by a well-accepted, but intractable graph similarity function. 
We also investigate the impact of node attributes on the performance of the different models and kernels. Our results reveal interesting findings. For instance, we find that theoretically more powerful models do not necessarily yield higher-quality representations, while graph kernels are shown to be very competitive with graph neural networks.", + "title": "An Empirical Study of the Expressiveness of Graph Kernels and Graph Neural Networks", + "authors": [ + "Giannis Nikolentzos", + "George Panagopoulos", + "Michalis Vazirgiannis" + ], + "emails": [ + "~Giannis_Nikolentzos1", + "~Michalis_Vazirgiannis1", + "polytechnique.edu" + ], + "rank": 2773 + }, + { + "url": "https://openreview.net/forum?id=B4SHgqe1kvX", + "ratings": [ + 3, + 4, + 4, + 4 + ], + "rating": "3.76", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Word representation is a fundamental component in neural language understanding models. Recently, pre-trained language models (PrLMs) offer a new performant method of contextualized word representations by leveraging the sequence-level context for modeling. Although the PrLMs generally give more accurate contextualized word representations than non-contextualized models do, they are still subject to a sequence of text contexts without diverse hints for word representation from multimodality. This paper thus proposes a visual representation method to explicitly enhance conventional word embedding with multiple-aspect senses from visual guidance. In detail, we build a small-scale word-image dictionary from a multimodal seed dataset where each word corresponds to diverse related images. The texts and paired images are encoded in parallel, followed by an attention layer to integrate the multimodal representations. We show that the method substantially improves the accuracy of disambiguation. 
Experiments on 12 natural language understanding and machine translation tasks further verify the effectiveness and the generalization capability of the proposed approach.", + "title": "Accurate Word Representations with Universal Visual Guidance", + "authors": [ + "Haojie Yu", + "Zhuosheng Zhang", + "hai zhao", + "Rui Wang", + "Masao Utiyama" + ], + "emails": [ + "sjtu.edu.cn", + "~hai_zhao1", + "~Rui_Wang10", + "~Zhuosheng_Zhang1", + "~Masao_Utiyama2" + ], + "rank": 2774 + }, + { + "url": "https://openreview.net/forum?id=GWMFRQXSbw", + "ratings": [ + 4, + 4, + 4, + 3 + ], + "rating": "3.76", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Adaptive gradient methods such as RMSProp and Adam use exponential moving estimate of the squared gradient to compute coordinate-wise adaptive step sizes, achieving better convergence than SGD in face of ill-conditioned or noisy objectives. However, Adam can have undesirable convergence behavior due to unstable or extreme adaptive learning rates. Methods such as AMSGrad and AdaBound have been proposed to stabilize the adaptive learning rates of Adam in the later stage of training, but they do not outperform Adam in some practical tasks such as training Transformers. In this paper, we propose an adaptive learning rate principle, in which the running mean of squared gradient is replaced by a weighted mean, with weights chosen to maximize the estimated variance of each coordinate. This gives a worst-case estimate for the local gradient variance, taking smaller steps when large curvatures or noisy gradients are present, which leads to more desirable convergence behavior than Adam. We prove the proposed algorithm converges under mild assumptions for nonconvex stochastic optimization problems, and demonstrate the improved efficacy of our adaptive averaging approach on image classification, machine translation and natural language understanding tasks. 
Moreover, our method overcomes the non-convergence issue of Adam in BERT pretraining at large batch sizes, while achieving better test performance than \\lamb~in the same setting.", + "title": "Adaptive Learning Rates with Maximum Variation Averaging", + "authors": [ + "Chen Zhu", + "Yu Cheng", + "Zhe Gan", + "Furong Huang", + "Jingjing Liu", + "Tom Goldstein" + ], + "emails": [ + "~Jingjing_Liu2", + "~Furong_Huang1", + "~Chen_Zhu2", + "~Yu_Cheng1", + "~Zhe_Gan1", + "~Tom_Goldstein1" + ], + "rank": 2775 + }, + { + "url": "https://openreview.net/forum?id=BG9hcZ-wIEq", + "ratings": [ + 4, + 4, + 3, + 4 + ], + "rating": "3.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Deep neural networks are widely adopted powerful tools for perceptual tasks. However, recent research indicated that they are easily fooled by adversarial examples, which are produced by adding imperceptible adversarial perturbations to clean examples. In this paper, we utilize the steganalysis rich model (SRM) to generate noise feature maps, and combine them with RGB images to discover the difference between adversarial examples and clean examples. In particular, we propose a two-stream pseudo-siamese network and train it end-to-end to detect adversarial examples. Our approach fuses the subtle difference in RGB images with the noise inconsistency in noise features. The proposed method has strong detection capability and transferability, and can be combined with any classifier without modifying its architecture or training procedure. Our extensive empirical experiments show that, compared with the state-of-the-art detection methods, the proposed method achieves excellent performance in distinguishing adversarial samples generated by popular attack methods on different real datasets. 
Moreover, our method has good generalization, it trained by a specific adversary can generalize to other adversaries.", + "title": "Detecting Adversarial Examples by Additional Evidence from Noise Domain", + "authors": [ + "Song Gao", + "Shui Yu", + "Shaowen Yao" + ], + "emails": [ + "ynu.edu.cn", + "~Shui_Yu1", + "~Song_Gao2" + ], + "rank": 2776 + }, + { + "url": "https://openreview.net/forum?id=8znruLfUZnT", + "decision": "Reject", + "ratings": [ + 4, + 3, + 4 + ], + "rating": "3.75", + "confidences": [ + 4, + 3, + 5 + ], + "abstract": "Sparse representation via a learned dictionary is a powerful prior for natural images. In recent years, unrolled sparse coding algorithms (e.g. LISTA) have proven to be useful for constructing interpretable deep-learning networks that perform on par with state-of-the-art models on image-restoration tasks. In this study we are concerned with extending the work of such convolutional dictionary learning (CDL) models. We propose to construct strided convolutional dictionaries with a single analytic low-pass filter and a set of learned filters regularized to occupy the complementary frequency space. By doing so, we address the necessary modeling assumptions of natural images with respect to convolutional sparse coding and reduce the mutual coherence and redundancy of the learned filters. We show improved denoising performance at reduced computational complexity when compared to other CDL methods, and competitive results when compared to popular deep-learning models. We further propose to parameterize the thresholds in the soft-thresholding operator of LISTA to be proportional to the estimated noise-variance from an input image. 
We demonstrate that this parameterization enhances robustness to noise-level mismatch between training and inference.", + "title": "Frequency Regularized Deep Convolutional Dictionary Learning and Application to Blind Denoising", + "authors": [ + "Nikola Pavle Janjusevic", + "Amirhossein Khalilian-Gourtani", + "Yao Wang" + ], + "emails": [ + "~Yao_Wang2", + "nyu.edu", + "~Nikola_Pavle_Janjusevic1" + ], + "rank": 2777 + }, + { + "url": "https://openreview.net/forum?id=28AnM10CHyO", + "ratings": [ + 3, + 4, + 3, + 5 + ], + "rating": "3.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Deep networks usually require a massive amount of labeled data for their training. Yet, such data may include some mistakes in the labels. Interestingly, networks have been shown to be robust to such errors. This work uses a spectral (Fourier) analysis of their learned mapping to provide an explanation for their robustness. In particular, we relate the smoothness regularization that usually exists in conventional training to attenuation of high frequencies, which mainly characterize noise. By using a connection between the smoothness and the spectral norm of the network weights, we suggest that one may further improve robustness via spectral normalization. 
Empirical experiments validate our claims and show the advantage of this normalization for classification with label noise.", + "title": "A Spectral Perspective of Neural Networks Robustness to Label Noise", + "authors": [ + "Oshrat Bar", + "Amnon Drory", + "Raja Giryes" + ], + "emails": [ + "~Raja_Giryes1", + "~Amnon_Drory1", + "~Oshrat_Bar1" + ], + "rank": 2778 + }, + { + "url": "https://openreview.net/forum?id=SRzz6RtOdKR", + "decision": "Reject", + "ratings": [ + 3, + 4, + 4 + ], + "rating": "3.75", + "confidences": [ + 3, + 5, + 4 + ], + "abstract": "In model learning, when the training dataset on which the parameters are optimized and the testing dataset on which the model is evaluated are not sampled from identical distributions, we say that the datasets are misaligned. It is well-known that this misalignment can negatively impact model performance. A common source of misalignment is that the inputs are sampled from different distributions. Another source for this misalignment is that the label generating process used to create the training dataset is imperfect. In this work, we consider this setting and additionally assume that the label generating process is able to provide us with a quantity for the role of each label in the misalignment between the datasets, which we consider to be privileged information. Specifically, we consider the task of regression with labels corrupted by heteroscedastic noise and we assume that we have access to an estimate of the variance over each sample. We propose a general approach to include this privileged information in the loss function together with dataset statistics inferred from the mini-batch to mitigate the impact of the dataset misalignment. Subsequently, we propose a specific algorithm for the heteroscedastic regression case, called Batch Inverse-Variance weighting, which adapts inverse-variance weighting for linear regression to the case of neural network function approximation. 
We demonstrate that this approach achieves a significant improvement in network training performances compared to baselines when confronted with high, input-independent noise.", + "title": "Batch Inverse-Variance Weighting: Deep Heteroscedastic Regression", + "authors": [ + "Vincent Mai", + "Waleed Khamies", + "Liam Paull" + ], + "emails": [ + "~Vincent_Mai1", + "~Waleed_Khamies1", + "~Liam_Paull1" + ], + "rank": 2779 + }, + { + "url": "https://openreview.net/forum?id=FoM-RnF6SNe", + "decision": "Reject", + "ratings": [ + 3, + 4, + 4, + 4 + ], + "rating": "3.75", + "confidences": [ + 4, + 5, + 3, + 4 + ], + "abstract": "Reinforcement learning has enabled agents to solve challenging control tasks from raw image inputs. However, manually crafting reward functions can be time consuming, expensive, and prone to human error. Competing objectives have been proposed for agents to learn without external supervision, such as artificial input entropy, information gain, and empowerment. Estimating these objectives can be challenging and it remains unclear how well they reflect task rewards or human behavior. We study these objectives across seven agents and three Atari games. Retrospectively computing the objectives from the agent's lifetime of experience simplifies accurate estimation. We find that all three objectives correlate more strongly with a human behavior similarity metric than with task reward. 
Moreover, input entropy and information gain both correlate more strongly with human similarity than task reward does.", + "title": "Evaluating Agents Without Rewards", + "authors": [ + "Brendon Matusch", + "Jimmy Ba", + "Danijar Hafner" + ], + "emails": [ + "~Danijar_Hafner1", + "~Jimmy_Ba1", + "~Brendon_Matusch1" + ], + "rank": 2780 + }, + { + "url": "https://openreview.net/forum?id=kmN6SQIjk-r", + "ratings": [ + 6, + 5, + 2, + 3 + ], + "rating": "3.75", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "Self-supervised approaches such as Momentum Contrast (MoCo) can leverage unlabeled data to produce pretrained models for subsequent fine-tuning on labeled data. While MoCo has demonstrated promising results on natural image classification tasks, its application to medical imaging tasks like chest X-ray interpretation has been limited. Chest X-ray interpretation is fundamentally different from natural image classification in ways that may limit the applicability of self-supervised approaches, including that (1) classification depends on differences in a small number of pixels, (2) X-rays are large and grayscale, (3) there are far fewer unlabeled chest X-ray images than natural images. In this work, we investigate whether MoCo-pretraining leads to better representations or initializations for chest X-ray interpretation. We conduct MoCo-pretraining on CheXpert, a large labeled dataset of X-rays, followed by supervised fine-tuning experiments on the pleural effusion task. Using 0.1% of labeled training data, we find that a linear model trained on MoCo-pretrained representations outperforms one trained on representations without MoCo-pretraining by an AUC of 0.096 (95% CI 0.061, 0.130), indicating that MoCo-pretrained representations are of higher quality. Furthermore, a model fine-tuned end-to-end with MoCo-pretraining outperforms its non-MoCo-pretrained counterpart by an AUC of 0.037 (95% CI 0.015, 0.062) with the 0.1% label fraction. 
These AUC improvements are observed for all label fractions for both the linear model and an end-to-end fine-tuned model with the greater improvements for smaller label fractions. Finally, we observe similar results on a small, target chest X-ray dataset (Shenzhen dataset for tuberculosis) with MoCo-pretraining done on the source dataset (CheXpert), which suggests that pretraining on unlabeled X-rays can provide transfer learning benefits for a target task. Our study demonstrates that MoCo-pretraining provides high-quality representations and transferable initializations for chest X-ray interpretation. ", + "title": "MoCo-Pretraining Improves Representations and Transferability of Chest X-ray Models", + "authors": [ + "Hari Sowrirajan", + "Jing Bo Yang", + "Andrew Y. Ng", + "Pranav Rajpurkar" + ], + "emails": [ + "~Hari_Sowrirajan1", + "~Andrew_Y._Ng1", + "~Pranav_Rajpurkar1", + "~Jing_Bo_Yang1" + ], + "rank": 2781 + }, + { + "url": "https://openreview.net/forum?id=PmVfnB0nkqr", + "decision": "Reject", + "ratings": [ + 3, + 4, + 5, + 4 + ], + "rating": "3.75", + "confidences": [ + 4, + 4, + 1, + 3 + ], + "abstract": "We propose a method for autonomously learning an object-centric representation of a continuous and high-dimensional environment that is suitable for planning. Such representations can immediately be transferred between tasks that share the same types of objects, resulting in agents that require fewer samples to learn a model of a new task. We first demonstrate our approach on a simple domain where the agent learns a compact, lifted representation that generalises across objects. We then apply it to a series of Minecraft tasks to learn object-centric representations, including object types\u2014directly from pixel data\u2014that can be leveraged to solve new tasks quickly. 
The resulting learned representations enable the use of a task-level planner, resulting in an agent capable of forming complex, long-term plans with considerably fewer environment interactions.", + "title": "Autonomous Learning of Object-Centric Abstractions for High-Level Planning", + "authors": [ + "Steven James", + "Benjamin Rosman", + "George Konidaris" + ], + "emails": [ + "~Benjamin_Rosman1", + "~George_Konidaris1", + "~Steven_James1" + ], + "rank": 2782 + }, + { + "url": "https://openreview.net/forum?id=REKvFYIgwz9", + "ratings": [ + 5, + 4, + 4, + 2 + ], + "rating": "3.75", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Optimal stopping is the problem of deciding the right time at which to take a particular action in a stochastic system, in order to maximize an expected reward. It has many applications in areas such as finance, healthcare, and statistics. In this paper, we employ deep Reinforcement Learning (RL) to learn optimal stopping policies in two financial engineering applications: namely option pricing, and optimal option exercise. We present for the first time a comprehensive empirical evaluation of the quality of optimal stopping policies identified by three state of the art deep RL algorithms: double deep Q-learning (DDQN), categorical distributional RL (C51), and Implicit Quantile Networks (IQN). In the case of option pricing, our findings indicate that in a theoretical Black-Schole environment, IQN successfully identifies nearly optimal prices. On the other hand, it is slightly outperformed by C51 when confronted to real stock data movements in a put option exercise problem that involves assets from the S&P500 index. More importantly, the C51 algorithm is able to identify an optimal stopping policy that achieves 8% more out-of-sample returns than the best of four natural benchmark policies. 
We conclude with a discussion of our findings which should pave the way for relevant future research.", + "title": "Deep Reinforcement Learning for Optimal Stopping with Application in Financial Engineering", + "authors": [ + "Abderrahim Fathan", + "Erick Delage" + ], + "emails": [ + "gmail.com", + "~Erick_Delage2" + ], + "rank": 2783 + }, + { + "url": "https://openreview.net/forum?id=2OcEd8jSvR", + "ratings": [ + 5, + 3, + 5, + 3, + 3 + ], + "rating": "3.74", + "confidences": [ + 3, + 5, + 4, + 3, + 4 + ], + "abstract": "A new model of generative adversarial networks for time series based on Euler scheme and Wasserstein distances including Sinkhorn divergence is proposed. Euler scheme improves stability of learning, provides meaningful learning parameters such as drift and volatility while allowing the representation of a large class of processes. We test our Euler GAN generations with usual Monte Carlo simulations in one-dimension and in a multi-dimensional case. We show how the proposed methodology can be combined with transfer learning to include the latest historical dataset features. The approach is tested on financial indicators computation on S\\&P500 and on an option hedging problem. ", + "title": "An Euler-based GAN for time series", + "authors": [ + "Carl Remlinger", + "Joseph Mickael", + "Romuald Elie" + ], + "emails": [ + "~Romuald_Elie1", + "~Carl_Remlinger1", + "edf.fr" + ], + "rank": 2784 + }, + { + "url": "https://openreview.net/forum?id=8Y-Y7RVo8vn", + "ratings": [ + 4, + 4, + 3, + 4 + ], + "rating": "3.74", + "confidences": [ + 5, + 4, + 5, + 5 + ], + "abstract": "Recent studies have demonstrated that noise in stochastic gradient descent (SGD) is closely related to generalization; A larger SGD noise, if not too large, results in better generalization. 
Since the covariance of the SGD noise is proportional to \u03b72/B, where \u03b7 is the learning rate and B is the minibatch size of SGD, so far the SGD noise has been controlled by changing \u03b7 and/or B. However, too large \u03b7 results in instability in the training dynamics and a small B prevents scalable parallel computation. It is thus desirable to develop a method of controlling the SGD noise without changing \u03b7 and B. In this paper, we propose a method that achieves this goal using ``noise enhancement'', which is easily implemented in practice. We expound the underlying theoretical idea and demonstrate that the noise enhancement actually improves generalization for real datasets. It turns out that large-batch training with the noise enhancement even shows better generalization compared with small-batch training.\n", + "title": "Improved generalization by noise enhancement", + "authors": [ + "Takashi Mori", + "Masahito Ueda" + ], + "emails": [ + "~Takashi_Mori1", + "phys.s.u-tokyo.ac.jp" + ], + "rank": 2785 + }, + { + "url": "https://openreview.net/forum?id=j7qEcn647RY", + "decision": "Reject", + "ratings": [ + 5, + 4, + 3, + 3 + ], + "rating": "3.73", + "confidences": [ + 3, + 5, + 4, + 3 + ], + "abstract": "Graph Convolutional Network (GCN) has become one of the most successful methods for graph representation learning. Training and evaluating GCNs on large graphs is challenging since full-batch GCNs have high overhead in memory and computation. In recent years, research communities have been developing stochastic sampling methods to handle large graphs when it is unreal to put the whole graph into a single batch. The performance of the model depends largely on the quality and size of subgraphs in the batch-training. Existing sampling approaches mostly focus on approximating the full-graph structure but care less about redundancy and randomness in sampling subgraphs. 
To address these issues and explore a better mechanism of producing high-quality subgraphs to train GCNs, we proposed the \\texttt{Linguine} framework where we designed a meta-model to prune the subgraph smartly. To efficiently obtain the meta-model, we designed a joint training scenario with the idea of hardness based learning. The empirical study shows that our method could augment the accuracy of the current state-of-art and reduce the error incurred by the redundancies in the subgraph structure. We also explored the reasoning behind smart pruning via its visualization.", + "title": "LINGUINE: LearnIng to pruNe on subGraph convolUtIon NEtworks", + "authors": [ + "Yihan He", + "Wei Cao", + "Shun Zheng", + "Zhifeng Gao", + "Jiang Bian" + ], + "emails": [ + "~Jiang_Bian1", + "~Zhifeng_Gao1", + "~Yihan_He1", + "~Wei_Cao1", + "~Shun_Zheng1" + ], + "rank": 2786 + }, + { + "url": "https://openreview.net/forum?id=MkrAyYVmt7b", + "decision": "Reject", + "ratings": [ + 3, + 4, + 4, + 4 + ], + "rating": "3.73", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Thanks to the tractability of their likelihood, some deep generative models show promise for seemingly straightforward but important applications like anomaly detection, uncertainty estimation, and active learning. However, the likelihood values empirically attributed to anomalies conflict with the expectations these proposed applications suggest. In this paper, we take a closer look at the behavior of distribution densities and show that these quantities carry less meaningful information than previously thought, beyond estimation issues or the curse of dimensionality. 
We conclude that the use of these likelihoods for out-of-distribution detection relies on strong and implicit hypotheses and highlight the necessity of explicitly formulating these assumptions for reliable anomaly detection.", + "title": "Perfect density models cannot guarantee anomaly detection", + "authors": [ + "Charline Le Lan", + "Laurent Dinh" + ], + "emails": [ + "~Laurent_Dinh1", + "~Charline_Le_Lan2" + ], + "rank": 2787 + }, + { + "url": "https://openreview.net/forum?id=2234Pp-9ikZ", + "decision": "Reject", + "ratings": [ + 5, + 3, + 3 + ], + "rating": "3.73", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "State-of-the-art results in deep learning have been improving steadily, in good part due to the use of larger models. However, widespread use is constrained by device hardware limitations, resulting in a substantial performance gap between state-of-the-art models and those that can be effectively deployed on small devices. \n\nWhile Knowledge Distillation (KD) theoretically enables small student models to emulate larger teacher models, in practice selecting a good student architecture requires considerable human expertise. Neural Architecture Search (NAS) appears as a natural solution to this problem but most approaches can be inefficient, as most of the computation is spent comparing architectures sampled from the same distribution, with negligible differences in performance. \n\nIn this paper, we propose to instead search for a family of student architectures sharing the property of being good at learning from a given teacher. \nOur approach AutoKD, powered by Bayesian Optimization, explores a flexible graph-based search space, enabling us to automatically learn the optimal student architecture distribution and KD parameters, while being 20x more sample efficient compared to existing state-of-the-art. 
We evaluate our method on 3 datasets; on large images specifically, we reach the teacher performance while using 3x less memory and 10x less parameters. Finally, while AutoKD uses the traditional KD loss, it outperforms more advanced KD variants using hand-designed students.", + "title": "Don't be picky, all students in the right family can learn from good teachers", + "authors": [ + "Roy Henha Eyono", + "Fabio Maria Carlucci", + "Pedro M Esperan\u00e7a", + "Binxin Ru", + "Philip Torr" + ], + "emails": [ + "~Fabio_Maria_Carlucci2", + "~Pedro_M_Esperan\u00e7a1", + "~Binxin_Ru1", + "~Roy_Henha_Eyono1", + "~Philip_Torr1" + ], + "rank": 2788 + }, + { + "url": "https://openreview.net/forum?id=Zqf6RGp5lqf", + "ratings": [ + 3, + 4, + 4, + 4 + ], + "rating": "3.73", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Understanding the interactions between novel drugs and target proteins is fundamentally important in disease research as discovering drug-protein interactions can be an exceptionally time-consuming and expensive process. Alternatively, this process can be simulated using modern deep learning methods that have the potential of utilising vast quantities of data to reduce the cost and time required to provide accurate predictions. In this paper, we seek to leverage a set of BERT-style models that have been pre-trained on vast quantities of both protein and drug data. The encodings produced by each model are then utilised as node representations for a graph convolutional neural network, which in turn models the interactions without the need to simultaneously fine-tune both protein and drug BERT models to the task. We evaluate the performance of our approach on two drug-target interaction datasets that were previously used as benchmarks in recent work. Our results significantly improve upon a vanilla BERT baseline approach as well as the former state-of-the-art methods for each task dataset. 
Our approach builds upon past work in two key areas; firstly, we take full advantage of two large pre-trained BERT models that provide improved representations of task-relevant properties of both drugs and proteins. Secondly, inspired by work in natural language processing that investigates how linguistic structure is represented in such models, we perform interpretability analyses that allow us to locate functionally-relevant areas of interest within each drug and protein. By modelling the drug-target interactions as a graph as opposed to a set of isolated interactions, we demonstrate the benefits of combining large pre-trained models and a graph neural network to make state-of-the-art predictions on drug-target binding affinity.", + "title": "Modelling Drug-Target Binding Affinity using a BERT based Graph Neural network", + "authors": [ + "Mark Lennox", + "Neil M. Robertson", + "Barry Devereux" + ], + "emails": [ + "~Neil_M._Robertson1", + "~Barry_Devereux1", + "~Mark_Lennox1" + ], + "rank": 2789 + }, + { + "url": "https://openreview.net/forum?id=X9LHtgR4vq", + "decision": "Reject", + "ratings": [ + 4, + 4, + 3 + ], + "rating": "3.73", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "Researchers manually compose most neural networks through painstaking experimentation.\nThis process is taxing and explores only a limited subset of possible\narchitecture. Researchers design architectures to address objectives ranging from low\nspace complexity to high accuracy through hours of experimentation. Neural architecture\nsearch (NAS) is a thriving field for automatically discovering architectures\nachieving these same objectives. Addressing these ever-increasing challenges in\ncomputing, we take inspiration from the brain because it has the most efficient\nneuronal wiring of any complex structure; its physiology inspires us to propose\nBractivate, a NAS algorithm inspired by neural dendritic branching. 
An evolutionary algorithm that adds new skip connection combinations to the most active blocks in the network, propagating salient\ninformation through the network. We apply our methods to lung x-ray, cell nuclei\nmicroscopy, and electron microscopy segmentation tasks to highlight Bractivate's robustness.\nMoreover, our ablation studies emphasize dendritic branching's necessity: ablating\nthese connections leads to significantly lower model performance. We finally\ncompare our discovered architecture with other state-of-the-art UNet models,\nhighlighting how efficient skip connections allow Bractivate to achieve comparable\nresults with substantially lower space and time complexity, proving how\nBractivate balances efficiency with performance. We invite you to work with our\ncode here: \\href{https://tinyurl.com/bractivate}{\\textcolor{violet}{https://tinyurl.com/bractivate}}.", + "title": "Bractivate: Dendritic Branching in Medical Image Segmentation Neural Architecture Search", + "authors": [ + "Leila Abdelrahman" + ], + "emails": [ + "~Leila_Abdelrahman1" + ], + "rank": 2790 + }, + { + "url": "https://openreview.net/forum?id=q4HuPoEQK_o", + "ratings": [ + 3, + 5, + 4, + 3 + ], + "rating": "3.73", + "confidences": [ + 5, + 4, + 3, + 3 + ], + "abstract": "Quantum Machine Learning (QML) has the potential to significantly advance the state-of-the-art in artificial intelligence, due to recent developments in quantum computing hardware and algorithm design. Particularly, an avenue opened up by these advances is the possibility of enhancing classical models through developing quantum analogues, which have greater representational power at no extra cost in terms of training and inference. Here, we investigate analogues of classical networks with stochastic layers, by introducing a class of hybrid stochastic networks that combine layers of several types, including stochastic quantum and classical layers and deterministic classical layers. 
Further, we introduce Quantum-Annealing (QA)-based sampling techniques that allow such models to be efficiently learned on current QA architectures, using variational and importance-sampling based approaches. Our framework provides benefits in training existing models, including Quantum Boltzmann Machines (QBMs) and Quantum Variational Autoencoders, by allowing local transverse field weights to be optimized jointly with other model parameters, and allows novel hierarchical hybrid models to be learned efficiently. We use classical simulations on synthetic and genomics data to test the impact of including quantum mechanical transverse field terms in such models relative to their classical counterparts. We show that hybrid models are able to achieve better predictive accuracy compared to classical models of matching architecture in these settings, and provide evidence that the local transverse terms can be interpreted as introducing tunable higher-order interactions by connecting genes belonging to common biological pathways.", + "title": "Hybrid Quantum-Classical Stochastic Networks with Boltzmann Layers", + "authors": [ + "Jonathan H Warrell", + "Prashant Emani", + "Mark Gerstein" + ], + "emails": [ + "~Jonathan_H_Warrell1", + "~Mark_Gerstein1", + "yale.edu" + ], + "rank": 2791 + }, + { + "url": "https://openreview.net/forum?id=hP-pn8bKffe", + "ratings": [ + 4, + 3, + 4 + ], + "rating": "3.73", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "The paper focuses on the a posteriori tuning of a generative model in order to favor the generation of good instances in the sense of some external differentiable criterion. The proposed approach, called Boltzmann Tuning of Generative Models (BTGM) applies to a wide range of applications. It covers conditional generative modelling as a particular case, and offers an affordable alternative to rejection sampling. The contribution of the paper is twofold. 
Firstly, the objective is formalized and tackled as a well-posed optimization problem; a practical methodology is proposed to choose among the candidate criteria representing the same goal, the one best suited to efficiently learn a tuned generative model. Secondly, the merits of the approach are demonstrated on a real-world application, in the context of robust design for energy policies, showing the ability of BTGM to sample the extreme regions of the considered criteria.", + "title": "Boltzman Tuning of Generative Models", + "authors": [ + "Victor Berger", + "Michele Sebag" + ], + "emails": [ + "~Victor_Berger1", + "~Michele_Sebag1" + ], + "rank": 2792 + }, + { + "url": "https://openreview.net/forum?id=H-AAaJ9v_lE", + "decision": "Reject", + "ratings": [ + 5, + 3, + 4 + ], + "rating": "3.73", + "confidences": [ + 2, + 5, + 4 + ], + "abstract": "Various phenomena in biology, physics, and engineering are modeled by differential equations. These differential equations including partial differential equations and ordinary differential equations can be converted and represented as integral equations. In particular, Volterra\u2013Fredholm\u2013Hammerstein integral equations are the main type of these integral equations and researchers are interested in investigating and solving these equations. In this paper, we propose Legendre Deep Neural Network (LDNN) for solving nonlinear Volterra\u2013Fredholm\u2013Hammerstein integral equations (V-F-H-IEs). LDNN utilizes Legendre orthogonal polynomials as activation functions of the Deep structure. We present how LDNN can be used to solve nonlinear V-F-H-IEs. We show using the Gaussian quadrature collocation method in combination with LDNN results in a novel numerical solution for nonlinear V-F-H-IEs. 
Several examples are given to verify the performance and accuracy of LDNN.", + "title": "Legendre Deep Neural Network (LDNN) and its application for approximation of nonlinear Volterra\u2013Fredholm\u2013Hammerstein integral equations", + "authors": [ + "Kourosh Parand", + "Zeinab Hajimohammadi", + "Ali Ghodsi" + ], + "emails": [ + "sbu.ac.ir", + "~Zeinab_Hajimohammadi1", + "~Ali_Ghodsi1" + ], + "rank": 2793 + }, + { + "url": "https://openreview.net/forum?id=VNpeIc3uc2", + "ratings": [ + 2, + 6, + 3 + ], + "rating": "3.73", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "In this paper, we address two fundamental research questions in neural architecture design: (i) How does the architecture topology impact the gradient flow during training? (ii) Can certain topological characteristics of deep networks indicate a priori (i.e., without training) which models, with a different number of parameters/FLOPS/layers, achieve a similar accuracy? To this end, we formulate the problem of deep learning architecture design from a network science perspective and introduce a new metric called NN-Mass to quantify how effectively information flows through a given architecture. We establish a theoretical link between NN-Mass, a topological property of neural architectures, and gradient flow characteristics (e.g., Layerwise Dynamical Isometry). As such, NN-Mass can identify models with similar accuracy, despite having significantly different size/compute requirements. Detailed experiments on both synthetic and real datasets (e.g., MNIST, CIFAR-10, CIFAR-100, ImageNet) provide extensive evidence for our insights. 
Finally, we show that the closed-form equation of our theoretically grounded NN-Mass metric enables us to design efficient architectures directly without time-consuming training and searching.", + "title": "On the relationship between topology and gradient propagation in deep networks", + "authors": [ + "Kartikeya Bhardwaj", + "Guihong Li", + "Radu Marculescu" + ], + "emails": [ + "~Guihong_Li1", + "~Kartikeya_Bhardwaj1", + "~Radu_Marculescu2" + ], + "rank": 2794 + }, + { + "url": "https://openreview.net/forum?id=4NNQ3l2hbN0", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 3 + ], + "rating": "3.73", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "In our modern world, an enormous amount of data surrounds us, and we are rarely interested in more than a handful of data points at once. It is like searching for needles in a haystack, and in many cases, there is no better algorithm than a random search, which might not be viable. Previously proposed algorithms for efficient database access are made for particular applications such as finding the min/max, finding all points within a range or finding the k-nearest neighbours. Consequently, there is a lack of versatility concerning what we can search when it comes to a gigantic database. In this work, we propose Search Data Structure Learning (SDSL), a generalization of the standard Search Data Structure (SDS) in which the machine has to learn how to search in the database. To evaluate approaches in this field, we propose a novel metric called Sequential Search Work Ratio (SSWR), a natural way of measuring a search's efficiency and quality. Finally, we inaugurate the field with the Efficient Learnable Binary Access (ELBA), a family of models for Search Data Structure Learning. It requires a means to train two parametric functions and a search data structure for binary codes. For the training, we developed a novel loss function, the F-beta Loss. 
For the SDS, we describe the Multi-Bernoulli Search (MBS), a novel approach for probabilistic binary codes. Finally, we exhibit the F-beta Loss and the MBS synergy by experimentally showing that it is at least twice as better than using the alternative loss functions of MIHash and HashNet and twenty times better than with another SDS based on the Hamming radius.", + "title": "Search Data Structure Learning", + "authors": [ + "Mathieu Duchesneau", + "Hansenclever Bassani", + "Alain Tapp" + ], + "emails": [ + "iro.umontreal.ca", + "~Hansenclever_Bassani1", + "~Mathieu_Duchesneau1" + ], + "rank": 2795 + }, + { + "url": "https://openreview.net/forum?id=RrSuwzJfMQN", + "decision": "Reject", + "ratings": [ + 3, + 3, + 3, + 5, + 5 + ], + "rating": "3.73", + "confidences": [ + 4, + 5, + 5, + 4, + 4 + ], + "abstract": "Recent studies have shown that deep neural networks are vulnerable to adversarial examples, but most of the methods proposed to defense adversarial examples cannot solve this problem fundamentally. In this paper, we theoretically prove that there is an upper bound for neural networks with identity mappings to constrain the error caused by adversarial noises. However, in actual computations, this kind of neural network no longer holds any upper bound and is therefore susceptible to adversarial examples. Following similar procedures, we explain why adversarial examples can fool other deep neural networks with skip connections. Furthermore, we demonstrate that a new family of deep neural networks called Neural ODEs (Chen et al., 2018) holds a weaker upper bound. This weaker upper bound prevents the amount of change in the result from being too large. Thus, Neural ODEs have natural robustness against adversarial examples. We evaluate the performance of Neural ODEs compared with ResNet under three white-box adversarial attacks (FGSM, PGD, DI2-FGSM) and one black-box adversarial attack (Boundary Attack). 
Finally, we show that the natural robustness of Neural ODEs is even better than the robustness of neural networks that are trained with adversarial training methods, such as TRADES and YOPO.\n", + "title": "TOWARDS NATURAL ROBUSTNESS AGAINST ADVERSARIAL EXAMPLES", + "authors": [ + "Haoyu Chu", + "Shikui Wei", + "Yao Zhao" + ], + "emails": [ + "~Haoyu_Chu1", + "~Shikui_Wei1", + "~Yao_Zhao1" + ], + "rank": 2796 + }, + { + "url": "https://openreview.net/forum?id=SkUfhuFsvK-", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 3 + ], + "rating": "3.72", + "confidences": [ + 5, + 4, + 4, + 5 + ], + "abstract": "Recently, Graph Convolutioal Networks (GCNs) have achieved significant success in many graph-based learning tasks, especially for node classification, due to its excellent ability in representation learning. Nevertheless, it remains challenging for GCN models to obtain satisfying prediction on graphs where few nodes are with known labels. In this paper, we propose a novel self-training algorithm based on GCN to boost semi-supervised node classification on graphs with little supervised information. Inspired by self-supervision strategy, the proposed method introduces an ingenious checking part to add new nodes as supervision after each training epoch to enhance node prediction. In particular, the embedded checking part is designed based on aggregated features, which is more accurate than previous methods and boosts node classification significantly. 
The proposed algorithm is validated on three public benchmarks in comparison with several state-of-the-art baseline algorithms, and the results illustrate its excellent performance.", + "title": "FASG: Feature Aggregation Self-training GCN for Semi-supervised Node Classification", + "authors": [ + "Gongpei Zhao", + "Tao Wang", + "Yidong Li", + "Yi Jin" + ], + "emails": [ + "~Tao_Wang1", + "~Yi_Jin2", + "~Gongpei_Zhao1", + "~Yidong_Li1" + ], + "rank": 2797 + }, + { + "url": "https://openreview.net/forum?id=KmlvRQo3tC", + "ratings": [ + 3, + 5, + 4, + 3 + ], + "rating": "3.72", + "confidences": [ + 4, + 4, + 5, + 5 + ], + "abstract": "Few-shot learning aims to train a classifier given only a few samples per class that are highly insufficient to describe the whole data distribution. These few-shot samples not only introduce high variance to the training but also may include outliers near the class boundaries. Directly feeding these samples to training algorithms can lead to unstable optimization and even incorrect gradient descent direction. In this paper, we improve the robustness to ``outliers'' by learning to propagate and refine the representations of few-shot samples to form a more compact data distribution before using them to train a classifier. We develop a mutual calibration among few-shot samples' representations by graph propagation, for which we learn an attention mechanism to build the graph and determine the propagation weights. 
On both clean datasets and datasets containing noisy labels, we show that our sample propagation generally improves different types of existing few-shot learning methods in multiple few-shot learning settings.", + "title": "MASP: Model-Agnostic Sample Propagation for Few-shot learning", + "authors": [ + "Lu Liu", + "Tianyi Zhou", + "Guodong Long", + "Jing Jiang", + "Xuanyi Dong", + "Chengqi Zhang" + ], + "emails": [ + "~Xuanyi_Dong1", + "~Guodong_Long2", + "~Tianyi_Zhou1", + "~Lu_Liu7", + "~Chengqi_Zhang1", + "~Jing_Jiang6" + ], + "rank": 2798 + }, + { + "url": "https://openreview.net/forum?id=tUNXLHsIx3r", + "ratings": [ + 4, + 4, + 4, + 3 + ], + "rating": "3.72", + "confidences": [ + 5, + 4, + 4, + 5 + ], + "abstract": "Recommendation systems play a decisive role in the choices we make on the internet. They seek to tailor decisions to a user. This makes trust a very important factor in recommendation systems, since it is believed that users are similar to the people that they trust, and hence will make similar choices to those users. Trust and its effects on the choices people make have been widely studied in the context of collaborative recommendation systems. It is understood that trust is not a single-faceted entity but can vary contextually. Recent research in the domain of trust based recommendation systems has shown that taking into account the facets of trust greatly improves the quality of recommendations (Mauro et al., 2019; Fang et al., 2015). We propose a recommendation system that takes multiple facets of trust into account while looking at how suitable a product might be for a particular user. The architecture proposed for this Multi-Faceted Trust Based Recommender (MFTBR) allows for extensibility - new facets of trust can be added without much effort - and dynamicity - trust facets are not weighed arbitrarily. Instead, the weights are optimised for the best result via a neural network. 
The trust facets considered here are local, global and category-wise trust. MFTBR performs significantly better than basic collaborative filtering - U2UCF (C. Desrosiers, 2011), as well as some established models in the domain of social and trust based recommendation systems - MTR (Mauro et al., 2019) and SocialFD (Yu et al., 2017). Thus, our model provides a better approximation of real-life recommendations, taking into account not only the impact of trust on recommendation, but the context in which trust is established.", + "title": "Multi-Faceted Trust Based Recommendation System", + "authors": [ + "Ramamoorthy Srinath", + "Shruthi Gopinath", + "Vaani Sundaresh", + "Varsha Venkatasubramanian" + ], + "emails": [ + "~Vaani_Sundaresh1", + "gmail.com" + ], + "rank": 2799 + }, + { + "url": "https://openreview.net/forum?id=a559vtwlvhh", + "ratings": [ + 4, + 4, + 4, + 3 + ], + "rating": "3.72", + "confidences": [ + 5, + 4, + 4, + 5 + ], + "abstract": "The AlphaZero algorithm has achieved remarkable success in a variety of sequential, perfect information games including Go, Shogi and Chess. In the original paper the only hyperparameter that is changed from game to game is the \u03b1 parameter governing a search prior. In this paper we investigate the properties of this hyperparameter. First, we build a formal intuition for its behavior on a toy example meant to isolate the influence of \u03b1. Then, by comparing performance of AlphaZero agents with different \u03b1 values on Connect 4, we show that the performance of AlphaZero improves considerably with a good choice of \u03b1. 
This all highlights the importance of \u03b1 as an interpretable hyperparameter which allows for cross-game tuning that more opaque hyperparameters like model architecture may not.", + "title": "Domain Knowledge in Exploration Noise in AlphaZero", + "authors": [ + "Eric Weiner", + "George D Monta\u00f1ez", + "Aaron Trujillo", + "Abtin Molavi" + ], + "emails": [ + "hmc.edu", + "~Eric_Weiner1" + ], + "rank": 2800 + }, + { + "url": "https://openreview.net/forum?id=b-7nwWHFtw", + "decision": "Reject", + "ratings": [ + 2, + 4, + 5, + 4 + ], + "rating": "3.72", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "Neural network pruning has demonstrated its success in significantly improving the computational efficiency of deep models while only introducing a small reduction on final accuracy. In this paper, we explore an extra bonus of neural network pruning in terms of enhancing privacy. Specifically, we show a novel connection between magnitude-based pruning and adding differentially private noise to intermediate layers under the over-parameterized regime. To the best of our knowledge, this is the first work that bridges pruning with the theory of differential privacy. The paper also presents experimental results by running the model inversion attack on two benchmark datasets, which supports the theoretical finding. 
", + "title": "Privacy-preserving Learning via Deep Net Pruning", + "authors": [ + "YANGSIBO HUANG", + "Xiaoxiao Li", + "Yushan Su", + "Sachin Ravi", + "Zhao Song", + "Sanjeev Arora", + "Kai Li" + ], + "emails": [ + "~Sanjeev_Arora1", + "~Xiaoxiao_Li1", + "~Kai_Li8", + "~Sachin_Ravi1", + "princeton.edu", + "~YANGSIBO_HUANG1", + "~Zhao_Song3" + ], + "rank": 2801 + }, + { + "url": "https://openreview.net/forum?id=kki60UTxJQ", + "ratings": [ + 4, + 3, + 4, + 4 + ], + "rating": "3.72", + "confidences": [ + 4, + 5, + 4, + 5 + ], + "abstract": "A significant obstacle to the development of new image completion models is the lack of a standardized evaluation metric that reflects human judgement. Recent work has proposed the use of human evaluation for image synthesis models, allowing for a reliable method to evaluate the visual quality of generated images. However, there does not yet exist a standardized human evaluation protocol for image completion. In this work, we propose such a protocol. We also provide experimental results of our evaluation method applied to many of the current state-of-the-art generative image models and compare these results to various automated metrics. Our evaluation yields a number of interesting findings. Notably, GAN-based image completion models are outperformed by autoregressive approaches.", + "title": "HYPE-C: Evaluating Image Completion Models Through Standardized Crowdsourcing", + "authors": [ + "Emily Walters", + "Weifeng Chen", + "Jia Deng" + ], + "emails": [ + "~Emily_Walters1", + "~Weifeng_Chen4", + "~Jia_Deng1" + ], + "rank": 2802 + }, + { + "url": "https://openreview.net/forum?id=UQz4_jo70Ci", + "decision": "Reject", + "ratings": [ + 3, + 3, + 5, + 4 + ], + "rating": "3.72", + "confidences": [ + 4, + 5, + 4, + 5 + ], + "abstract": "Most traditional Siamese trackers are used to regard the location of the max response map as the center of target. 
However, it is difficult for these traditional methods to calculate response value accurately when face the similar object, deformation, background clutters and other challenges. So how to get the reliable response map is the key to improve tracking performance. Accordingly, a simple yet effective short-term tracking framework (called SiamCAN),by which bridging the information flow between search branch and template branch, is proposed to solve the above problem in this paper. Moreover, in order to get more accurate target estimation, an anchor-free mechanism and specialized training strategy are applied to narrow the gap between the predicted bounding box and groundtruth. The proposed method achieves state-of-the-art performance on four visual tracking benchmarks including UAV123, OTB100, VOT2018 and VOT2019, outperforming the strong baseline, SiamBAN, by 0.327 \u2192 0.331 on VOT2019 and 0.631 \u2192 0.638 success score, 0.833 \u2192 0.850 precision score on UAV123.", + "title": "SiamCAN:Simple yet Effective Method to enhance Siamese Short-Term Tracking", + "authors": [ + "Yue Zhao", + "Zhibin Yu" + ], + "emails": [ + "~Zhibin_Yu3", + "~Yue_Zhao10" + ], + "rank": 2803 + }, + { + "url": "https://openreview.net/forum?id=npkSFg-ktnW", + "decision": "Reject", + "ratings": [ + 3, + 5, + 3, + 4 + ], + "rating": "3.72", + "confidences": [ + 4, + 4, + 5, + 5 + ], + "abstract": "Autoencoders perform a powerful information compression framework with are construction loss and can be a regularization module in different tasks, which has no generative ability itself. We wondering if an autoencoder gains generative ability without using GAN and VAE based modification, which are two mature methods. Here we propose a new method: Disentanglement and ExplorationAutoencoder (DEAE), DEAE using a disentangle and exploration positive iteration achieve semantic controllable synthesis especially controllable mining the novel semantic value. 
For instance, given only red, green, and blue object color while mining new object color expressions in the image domain. The encoder of DEAE first turn the input sample into a disentangled latent code, then explore the latent codespace by attribute oriented interpolation. To encourage interpolated latent code successfully output a semantically meaningful sample by the decoder, we propose a regularization procedure by \u2019reuse\u2019 encoder and constrain the output latent value which implicitly improves the quality of the interpolated sample. DEAE can become a generative model and synthesis semantic controllable samples by interpolating latent code, which can even synthesis novel attribute value never is shown in the original dataset. Experiments demonstrate how disentanglement and exploration can boost each other which empowers autoencoder generative ability. We also demonstrate that DEAE can improve the performance of downstream tasks compared with GANand VAE based generative model, especially in controllable data augmentation, dataset bias elimination (Fairness)", + "title": "Generative Auto-Encoder: Controllable Synthesis with Disentangled Exploration", + "authors": [ + "Yunhao Ge", + "Gan Xin", + "Zhi Xu", + "Yao Xiao", + "Yunkui Pang", + "Yining HE", + "Laurent Itti" + ], + "emails": [ + "~Yining_HE1", + "~Gan_Xin1", + "~Yao_Xiao3", + "~Yunkui_Pang1", + "~Zhi_Xu2", + "~Yunhao_Ge1", + "~Laurent_Itti1" + ], + "rank": 2804 + }, + { + "url": "https://openreview.net/forum?id=bnuU0PzXl0-", + "decision": "Reject", + "ratings": [ + 4, + 4, + 3 + ], + "rating": "3.71", + "confidences": [ + 5, + 5, + 4 + ], + "abstract": "Gender-bias stereotypes have recently raised significant ethical concerns in natural language processing. However, progress in the detection and evaluation of gender-bias in natural language understanding through inference is limited and requires further investigation. 
In this work, we propose an evaluation methodology to measure these biases by constructing a probe task that involves pairing a gender-neutral premise against a gender-specific hypothesis. We use our probe task to investigate state-of-the-art NLI models on the presence of gender stereotypes using occupations. Our findings suggest that three models (BERT, RoBERTa, and BART) trained on MNLI and SNLI data-sets are significantly prone to gender-induced prediction errors. We also find that debiasing techniques such as augmenting the training dataset to ensure that it is a gender-balanced dataset can help reduce such bias in certain cases. ", + "title": "Evaluating Gender Bias in Natural Language Inference ", + "authors": [ + "Shanya Sharma", + "Manan Dey", + "Koustuv Sinha" + ], + "emails": [ + "~Koustuv_Sinha1", + "~Shanya_Sharma1", + "~Manan_Dey2" + ], + "rank": 2805 + }, + { + "url": "https://openreview.net/forum?id=lFSZySpXXX8", + "ratings": [ + 3, + 5, + 3, + 4 + ], + "rating": "3.71", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "The success of deep supervised learning depends on its automatic data representation abilities. A good representation of high-dimensional complex data should enjoy low-dimensionally and disentanglement while losing as little information as possible. \nIn this work, we give a statistical understanding of how deep representation goals can be achieved with reproducing kernel Hilbert spaces (RKHS) and generative adversarial networks (GAN). At the population level, we formulate the ideal representation learning task as that of finding a nonlinear map that minimizes the sum of losses characterizing conditional independence (with RKHS) and disentanglement (with GAN). We estimate the target map at the sample level nonparametrically with deep neural networks. We prove the consistency in terms of the population objective function value. 
We validate the proposed methods via comprehensive numerical experiments and real data analysis in the context of regression and classification. The resulting prediction accuracies are better than state-of-the-art methods.", + "title": "Toward Understanding Supervised Representation Learning with RKHS and GAN", + "authors": [ + "Xu Liao", + "Jin Liu", + "Tianwen Wen", + "Yuling Jiao", + "Jian Huang" + ], + "emails": [ + "~Jin_Liu5", + "xiaomi.com", + "u.duke.nus.edu", + "~Jian_Huang5", + "whu.edu.cn" + ], + "rank": 2806 + }, + { + "url": "https://openreview.net/forum?id=XbJiphOWXiU", + "decision": "Reject", + "ratings": [ + 4, + 5, + 3, + 3 + ], + "rating": "3.71", + "confidences": [ + 4, + 3, + 3, + 4 + ], + "abstract": "This paper formulates hypothesis verification as an RL problem. Specifically, we aim to build an agent that, given a hypothesis about the dynamics of the world, can take actions to generate observations which can help predict whether the hypothesis is true or false. Existing RL algorithms fail to solve this task, even for simple environments. \nIn order to train the agents, we exploit the underlying structure of many hypotheses, factorizing them as {pre-condition, action sequence, post-condition} triplets. By leveraging this structure we show that RL agents are able to succeed at the task. 
Furthermore, subsequent fine-tuning of the policies allows the agent to correctly verify hypotheses not amenable to the above factorization.", + "title": "Empirically Verifying Hypotheses Using Reinforcement Learning", + "authors": [ + "Kenneth Marino", + "Rob Fergus", + "Arthur Szlam", + "Abhinav Gupta" + ], + "emails": [ + "~Abhinav_Gupta1", + "~Arthur_Szlam1", + "~Rob_Fergus1", + "~Kenneth_Marino1" + ], + "rank": 2807 + }, + { + "url": "https://openreview.net/forum?id=Z2_djlm7DmA", + "ratings": [ + 4, + 4, + 3, + 4 + ], + "rating": "3.71", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Knowledge Graph Completion (KGC) mainly devotes to link predicting for an entity pair in Knowledge Graph (KG) according to known facts. In this work, we present a novel model for this end. In this model, Quantum and Translation Embedding are used as components for logical and structural feature capturing in the same vector subspace, respectively. The two components have synergy with each other and achieve impressive performance at low cost which is close to the efficient model TransE. Surprisingly, the performance on challenging datasets such as fb15k237 and WN18RR is up to 94.89% and 92.79% in metric Hits@1 while the dimension of embedding is only 4 in the process of training. 
The insight of this work enlightens the notion of dense feature model design for KGC which is a new alternative to Deep Neural networks (DNN) in this task or even a better choice.", + "title": "Quantum and Translation Embedding for Knowledge Graph Completion", + "authors": [ + "Panfeng Chen", + "Yisong Wang", + "Renyan Feng", + "Xiaomin Yu", + "Quan Yu" + ], + "emails": [ + "outlook.com", + "gzu.edu.cn", + "163.com", + "~Panfeng_Chen1", + "126.com" + ], + "rank": 2808 + }, + { + "url": "https://openreview.net/forum?id=UV9kN3S4uTZ", + "decision": "Reject", + "ratings": [ + 4, + 5, + 4, + 2 + ], + "rating": "3.71", + "confidences": [ + 3, + 5, + 4, + 5 + ], + "abstract": "Unsupervised learning of interactions from multi-agent trajectories has broad applications in physics, vision, and robotics. However, existing neural relational inference works are limited to static relations. We consider a more general setting of dynamic relational inference where interactions change over time. We propose DYnamic multi-Agent Relational Inference (DYARI) model, a deep generative model that can reason about dynamic relations. Using a simulated physics system, we study various dynamic relation scenarios, including periodic and additive dynamics. We perform a comprehensive study on the trade-off between dynamic and inference period, the impact of the training scheme, and model architecture on dynamic relational inference accuracy. 
We also showcase an application of our model to infer coordination and competition patterns from real-world multi-agent basketball trajectories.", + "title": "Dynamic Relational Inference in Multi-Agent Trajectories", + "authors": [ + "Ruichao Xiao", + "Manish Kumar Singh", + "Rose Yu" + ], + "emails": [ + "~Manish_Kumar_Singh1", + "~Rose_Yu1", + "~Ruichao_Xiao1" + ], + "rank": 2809 + }, + { + "url": "https://openreview.net/forum?id=pNDvPXd1qUk", + "ratings": [ + 4, + 4, + 4, + 3 + ], + "rating": "3.71", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "Despite the remarkable progress made by the policy gradient algorithms in reinforcement learning (RL), sub-optimal policies usually result from the local exploration property of the policy gradient update. In this work, we propose a method called Zeroth-Order Supervised Policy Improvement (ZOSPI) that exploits the estimated value function Q globally while preserving the local exploitation of the policy gradient methods. Experiments show that ZOSPI achieves competitive results on the MuJoCo benchmarks with a remarkable sample efficiency. Moreover, different from the conventional policy gradient methods, the policy learning of ZOSPI is conducted in a self-supervised manner. 
We show such a self-supervised learning paradigm has the flexibility of including optimistic exploration as well as adopting a non-parametric policy.", + "title": "Self-Supervised Continuous Control without Policy Gradient", + "authors": [ + "Hao Sun", + "Ziping Xu", + "Meng Fang", + "Yuhang Song", + "Jiechao Xiong", + "Bo Dai", + "Zhengyou Zhang", + "Bolei Zhou" + ], + "emails": [ + "~Hao_Sun3", + "~Bolei_Zhou5", + "~Meng_Fang1", + "~Bo_Dai2", + "~Zhengyou_Zhang2", + "~Yuhang_Song1", + "~Jiechao_Xiong1", + "umich.edu" + ], + "rank": 2810 + }, + { + "url": "https://openreview.net/forum?id=wVYtfckXU0T", + "decision": "Reject", + "ratings": [ + 5, + 4, + 5, + 2 + ], + "rating": "3.71", + "confidences": [ + 4, + 3, + 2, + 5 + ], + "abstract": "Image animation generates a video of a source image following the motion of a driving video. Self-supervised image animation approaches do not require explicit pose references as inputs, thus offering large flexibility in learning. State-of-the-art self-supervised image animation approaches mostly warp the source image according to the motion of the driving video, and recover the warping artifacts by inpainting. When the source and the driving images have large pose differences, heavy inpainting is necessary. Without guidance, heavily inpainted regions usually suffer from loss of details. While previous data augmentation techniques such as CutMix are effective in regularizing non-warp-based image generation, directly applying them to image animation ignores the difficulty of inpainting on the warped image. We propose PriorityCut, a novel augmentation approach that uses the top-k percent occluded pixels of the foreground to regularize image animation. 
By taking into account the difficulty of inpainting, PriorityCut preserves better identity than vanilla CutMix and outperforms state-of-the-art image animation models in terms of the pixel-wise difference, low-level similarity, keypoint distance, and feature embedding distance.", + "title": "PriorityCut: Occlusion-aware Regularization for Image Animation", + "authors": [ + "Wai Ting Cheung", + "Gyeongsu Chae" + ], + "emails": [ + "~Wai_Ting_Cheung1", + "~Gyeongsu_Chae1" + ], + "rank": 2811 + }, + { + "url": "https://openreview.net/forum?id=LxBFTZT3UOU", + "decision": "Reject", + "ratings": [ + 3, + 4, + 4, + 4 + ], + "rating": "3.71", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": " A fundamental challenge in deep learning is that the optimal step sizes for update steps of stochastic gradient descent are unknown. In traditional optimization, line searches are used to determine good step sizes, however, in deep learning, it is too costly to search for good step sizes on the expected empirical loss due to noisy losses. This empirical work shows that it is possible to approximate the expected empirical loss on vertical cross sections for common deep learning tasks considerably cheaply. This is achieved by applying traditional one-dimensional function fitting to measured noisy losses of such cross sections. The step to a minimum of the resulting approximation is then used as step size for the optimization. 
This approach leads to a robust and straightforward optimization method which performs well across datasets and architectures without the need of hyperparameter tuning.\n", + "title": "A straightforward line search approach on the expected empirical loss for stochastic deep learning problems", + "authors": [ + "Maximus Mutschler", + "Andreas Zell" + ], + "emails": [ + "~Maximus_Mutschler1", + "~Andreas_Zell1" + ], + "rank": 2812 + }, + { + "url": "https://openreview.net/forum?id=kLyLW3RRqU", + "ratings": [ + 4, + 3, + 4, + 4 + ], + "rating": "3.71", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Neural networks have been shown vulnerable to a variety of adversarial algorithms. A crucial step for understanding the rationale behind this lack of robustness is to assess the potential of the neural networks' representation to encode the existing features. Here, we propose a method to understand the representation quality of the neural networks using a novel test based on Zero-Shot Learning, entitled Raw Zero-Shot. The principal idea is that if an algorithm learns rich features, such features should be able to interpret 'new or unknown' classes as a combination of previously learned features. This is because unknown classes usually share several regular features with recognised (learned) classes, given that the features learned are general enough. We further introduce two metrics to assess this learned representation which interprets unknown classes. One is based on inter-cluster validation technique, while the other is based on the difference in the representation between the case when the class is unknown and the case when it is known to the classifier. Experiments suggest that several adversarial defences not only decrease the attack accuracy of some attacks but also improve the representation quality of the classifiers. 
Further, a low p-value of the paired-samples t-test suggests that several adversarial defences, in general, change the representation quality significantly. Moreover, experiments also reveal a relationship between the proposed metrics and adversarial attacks (a high Pearson Correlation Coefficient (PCC) and low p-value).", + "title": "Representation Quality Of Neural Networks Links To Adversarial Attacks and Defences", + "authors": [ + "Shashank Kotyan", + "Moe Matsuki", + "Danilo Vasconcellos Vargas" + ], + "emails": [ + "~Shashank_Kotyan1", + "~Danilo_Vasconcellos_Vargas1", + "gmail.com" + ], + "rank": 2813 + }, + { + "url": "https://openreview.net/forum?id=JyDnXkeJpjU", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 3 + ], + "rating": "3.69", + "confidences": [ + 3, + 4, + 4, + 5 + ], + "abstract": "This paper investigates the use of nonparametric kernel-regression to obtain a task- similarity aware meta-learning algorithm. Our hypothesis is that the use of task- similarity helps meta-learning when the available tasks are limited and may contain outlier/ dissimilar tasks. While existing meta-learning approaches implicitly assume the tasks as being similar, it is generally unclear how this task-similarity could be quantified and used in the learning. As a result, most popular meta- learning approaches do not actively use the similarity/dissimilarity between the tasks, but rely on availability of huge number of tasks for their working. Our contribution is a novel framework for meta-learning that explicitly uses task-similarity in the form of kernels and an associated meta-learning algorithm. We model the task-specific parameters to belong to a reproducing kernel Hilbert space where the kernel function captures the similarity across tasks. The proposed algorithm iteratively learns a meta-parameter which is used to assign a task-specific descriptor for every task. 
The task descriptors are then used to quantify the task-similarity through the kernel function. We show how our approach conceptually generalizes the popular meta-learning approaches of model-agnostic meta-learning (MAML) and Meta-stochastic gradient descent (Meta-SGD) approaches. Numerical experiments with regression and classification tasks show that our algorithm outperforms these approaches when the number of tasks is limited, even in the presence of out- lier or dissimilar tasks. This supports our hypothesis that task-similarity helps improve the meta-learning performance in task-limited and adverse settings.", + "title": "Task-similarity Aware Meta-learning through Nonparametric Kernel Regression", + "authors": [ + "Arun Venkitaraman", + "Anders Hansson", + "Bo Wahlberg" + ], + "emails": [ + "liu.se", + "~Arun_Venkitaraman1", + "~Bo_Wahlberg1" + ], + "rank": 2814 + }, + { + "url": "https://openreview.net/forum?id=fkhl7lb3aw", + "decision": "Reject", + "ratings": [ + 4, + 3, + 5, + 3 + ], + "rating": "3.69", + "confidences": [ + 3, + 4, + 4, + 5 + ], + "abstract": "When using machine learning to solve practical tasks, we often face the problem of class imbalance. Unbalanced classes will cause the model to generate preferences during the learning process, thereby ignoring classes with fewer samples. The oversampling algorithm achieves the purpose of balancing the difference in quantity by generating a minority of samples. The quality of the artificial samples determines the impact of the oversampling algorithm on model training. Therefore, a challenge of the oversampling algorithm is how to find a suitable sample generation space. However, too strong conditional constraints can make the generated samples as non-noise points as possible, but at the same time they also limit the search space of the generated samples, which is not conducive to the discovery of better-quality new samples. 
Therefore, based on this problem, we propose an oversampling algorithm ROGA based on genetic algorithm. Based on random sampling, new samples are gradually generated and the samples that may become noise are filtered out. ROGA can ensure that the sample generation space is as wide as possible, and it can also reduce the noise samples generated. By verifying on multiple datasets, ROGA can achieve a good result.", + "title": "ROGA: Random Over-sampling Based on Genetic Algorithm", + "authors": [ + "ZONGDA HAN", + "XIUQUAN QIAO", + "SHUBO ZHAN" + ], + "emails": [ + "cincc.cn", + "~ZONGDA_HAN1", + "bupt.edu.cn" + ], + "rank": 2815 + }, + { + "url": "https://openreview.net/forum?id=bBDlTR5eDIX", + "decision": "Reject", + "ratings": [ + 4, + 4, + 3, + 4 + ], + "rating": "3.69", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "In recent years, the task of video prediction---forecasting future video given past video frames---has attracted attention in the research community. In this paper we propose a novel approach to this problem with Vector Quantized Variational AutoEncoders (VQ-VAE). With VQ-VAE we compress high-resolution videos into a hierarchical set of multi-scale discrete latent variables. Compared to pixels, this compressed latent space has dramatically reduced dimensionality, allowing us to apply scalable autoregressive generative models to predict video. In contrast to previous work that has largely emphasized highly constrained datasets, we focus\non very diverse, large-scale datasets such as Kinetics-600. We predict video at a higher resolution, 256\u00d7256, than any other previous method to our knowledge. 
We further validate our approach against prior work via a crowdsourced human evaluation.", + "title": "Predicting Video with VQVAE", + "authors": [ + "Jacob C Walker", + "Ali Razavi", + "Aaron van den Oord" + ], + "emails": [ + "~Jacob_C_Walker1", + "~Aaron_van_den_Oord2", + "~Ali_Razavi1" + ], + "rank": 2816 + }, + { + "url": "https://openreview.net/forum?id=BIIwfP55pp", + "decision": "Reject", + "ratings": [ + 4, + 4, + 3, + 4 + ], + "rating": "3.69", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "Imitation learning is a natural way for a human to describe a task to an agent, and it can be combined with reinforcement learning to enable the agent to solve that task through exploration. However, traditional methods which combine imitation learning and reinforcement learning require a very large amount of interaction data to learn each new task, even when bootstrapping from a demonstration. One solution to this is to use meta reinforcement learning (meta-RL) to enable an agent to quickly adapt to new tasks at test time. In this work, we introduce a new method to combine imitation learning with meta reinforcement learning, Probabilistic Embeddings for hybrid meta-Reinforcement and Imitation Learning (PERIL). Dual inference strategies allow PERIL to precondition exploration policies on demonstrations, which greatly improves adaptation rates in unseen tasks. In contrast to pure imitation learning, our approach is capable of exploring beyond the demonstration, making it robust to task alterations and uncertainties. 
By exploiting the flexibility of meta-RL, we show how PERIL is capable of interpolating from within previously learnt dynamics to adapt to unseen tasks, as well as unseen task families, within a set of meta-RL benchmarks under sparse rewards.", + "title": "PERIL: Probabilistic Embeddings for hybrid Meta-Reinforcement and Imitation Learning", + "authors": [ + "Alvaro Prat", + "Edward Johns" + ], + "emails": [ + "~Alvaro_Prat1", + "~Edward_Johns1" + ], + "rank": 2817 + }, + { + "url": "https://openreview.net/forum?id=cfqJY583gim", + "ratings": [ + 4, + 4, + 3, + 4 + ], + "rating": "3.69", + "confidences": [ + 4, + 3, + 5, + 4 + ], + "abstract": "Intelligently reasoning about the world often requires integrating data from multiple modalities, as any individual modality may contain unreliable or incomplete information. Prior work in multimodal learning fuses input modalities only after significant independent processing. On the other hand, the brain performs multimodal processing almost immediately. This divide between classical multimodal learning and neuroscience suggests that a detailed study of early multimodal fusion could improve artificial multimodal representations. To facilitate the study of early multimodal fusion, we create a convolutional LSTM network architecture that simultaneously processes both audio and visual inputs, and allows us to select the layer at which audio and visual information combines. 
Our results demonstrate that immediate fusion of audio and visual inputs in the initial C-LSTM layer results in higher performing networks that are more robust to the addition of white noise in both audio and visual inputs.", + "title": "On the Benefits of Early Fusion in Multimodal Representation Learning", + "authors": [ + "Sabera J Talukder", + "George Barnum", + "Yisong Yue" + ], + "emails": [ + "~Sabera_J_Talukder1", + "~George_Barnum1", + "~Yisong_Yue1" + ], + "rank": 2818 + }, + { + "url": "https://openreview.net/forum?id=sv2wC7Amb0s", + "decision": "Reject", + "ratings": [ + 3, + 5, + 3, + 4 + ], + "rating": "3.69", + "confidences": [ + 2, + 3, + 5, + 3 + ], + "abstract": "In this paper we propose a conditioning trick, called difference departure from normality, applied on the generator network in response to instability issues during GAN training. We force the generator to get closer to the departure from normality function of real samples computed in the spectral domain of Schur decomposition. This binding makes the generator amenable to truncation and does not limit exploring all the possible modes. We slightly modify the BigGAN architecture incorporating residual network for synthesizing 2D representations of audio signals which enables reconstructing high quality sounds with some preserved phase information. Additionally, the proposed conditional training scenario makes a trade-off between fidelity and variety for the generated spectrograms. The experimental results on UrbanSound8k and ESC-50 environmental sound datasets and the Mozilla common voice dataset have shown that the proposed GAN configuration with the conditioning trick remarkably outperforms baseline architectures, according to three objective metrics: inception score, Frechet inception distance, and signal-to-noise ratio. 
", + "title": "Conditioning Trick for Training Stable GANs", + "authors": [ + "MOHAMMAD ESMAEILPOUR", + "Raymel Alfonso Sallo", + "Olivier St-Georges", + "Patrick Cardinal", + "Alessandro Lameiras Koerich" + ], + "emails": [ + "~Patrick_Cardinal1", + "~MOHAMMAD_ESMAEILPOUR1", + "ens.etsmtl.ca", + "etsmtl.ca" + ], + "rank": 2819 + }, + { + "url": "https://openreview.net/forum?id=U7-FJu0iE3t", + "decision": "Reject", + "ratings": [ + 3, + 5, + 3, + 4 + ], + "rating": "3.69", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "Classic decision tree learning is a binary classification algorithm that constructs models with first-class transparency - every classification has a directly derivable explanation. However, learning decision trees on modern datasets generates large trees, which in turn generate decision paths of excessive depth, obscuring the explanation of classifications. To improve the comprehensibility of classifications, we propose a new decision tree model that we call Cascading Decision Trees. Cascading Decision Trees shorten the size of explanations of classifications, without sacrificing model performance overall. Our key insight is to separate the notion of a decision path and an explanation path. Utilizing this insight, instead of having one monolithic decision tree, we build several smaller decision subtrees and cascade them in sequence. Our cascading decision subtrees are designed to specifically target explanations for positive classifications. This way each subtree identifies the smallest set of features that can classify as many positive samples as possible, without misclassifying any negative samples. Applying cascading decision trees to new samples results in a significantly shorter and succinct explanation, if one of the subtrees detects a positive classification. In that case, we immediately stop and report the decision path of only the current subtree to the user as an explanation for the classification. 
We evaluate our algorithm on standard datasets, as well as new real-world applications and find that our model shortens the explanation depth by over 40.8\\% for positive classifications compared to the classic decision tree model.\n", + "title": "Succinct Explanations with Cascading Decision Trees", + "authors": [ + "JIALU ZHANG", + "Mark Santolucito", + "Ruzica Piskac" + ], + "emails": [ + "~Ruzica_Piskac1", + "~JIALU_ZHANG1", + "barnard.edu" + ], + "rank": 2820 + }, + { + "url": "https://openreview.net/forum?id=xfOVXyO_cwJ", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 3 + ], + "rating": "3.69", + "confidences": [ + 3, + 4, + 4, + 5 + ], + "abstract": "Uncertainty quantification for complex deep learning models is increasingly important as these techniques see growing use in high-stakes, real-world settings. Currently, the quality of a model's uncertainty is evaluated using point-prediction metrics such as negative log-likelihood or the Brier score on heldout data. In this study, we provide the first large scale evaluation of the empirical frequentist coverage properties of well known uncertainty quantification techniques on a suite of regression and classification tasks. We find that, in general, some methods do achieve desirable coverage properties on \\emph{in distribution} samples, but that coverage is not maintained on out-of-distribution data. 
Our results demonstrate the failings of current uncertainty quantification techniques as dataset shift increases and establish coverage as an important metric in developing models for real-world applications.", + "title": "Empirical Frequentist Coverage of Deep Learning Uncertainty Quantification Procedures", + "authors": [ + "Benjamin Kompa", + "Jasper Snoek", + "Andrew Beam" + ], + "emails": [ + "~Jasper_Snoek1", + "~Andrew_Beam1", + "~Benjamin_Kompa1" + ], + "rank": 2821 + }, + { + "url": "https://openreview.net/forum?id=_QQ_v_w_uNV", + "ratings": [ + 3, + 5, + 4, + 3 + ], + "rating": "3.69", + "confidences": [ + 3, + 2, + 5, + 3 + ], + "abstract": "Deep neural network models are vulnerable to adversarial attacks. In many cases, malicious inputs intentionally crafted for one model can fool another model in the black-box attack setting. However, there is a lack of systematic studies on the transferability of adversarial examples and how to generate universal adversarial examples. In this paper, we systematically study the transferability of adversarial attacks for text classification models. In particular, we conduct extensive experiments to investigate how various factors, such as network architecture, input format, word embedding, and model capacity, affect the transferability of adversarial attacks. Based on these studies, we then propose universal black-box attack algorithms that can induce adversarial examples to attack almost all existing models. These universal adversarial examples reflect the defects of the learning process and the bias in the training dataset. Finally, we generalize these adversarial examples into universal word replacement rules that can be used for model diagnostics. 
", + "title": "Generating universal language adversarial examples by understanding and enhancing the transferability across neural models", + "authors": [ + "Liping Yuan", + "Xiaoqing Zheng", + "Yi Zhou", + "Cho-Jui Hsieh", + "Kai-Wei Chang", + "Xuanjing Huang" + ], + "emails": [ + "~Cho-Jui_Hsieh1", + "~Xiaoqing_Zheng1", + "~Xuanjing_Huang1", + "~Liping_Yuan1", + "~Kai-Wei_Chang1", + "~Yi_Zhou11" + ], + "rank": 2822 + }, + { + "url": "https://openreview.net/forum?id=NHysmWivrHP", + "ratings": [ + 3, + 4, + 4 + ], + "rating": "3.69", + "confidences": [ + 4, + 5, + 4 + ], + "abstract": "Single image depth estimation is a critical issue for robot vision, augmented reality, and many other applications when an image sequence is not available. Self-supervised single image depth estimation models target at predicting accurate disparity map just from one single image without ground truth supervision or stereo image pair during real applications. Compared with direct single image depth estimation, single image stereo algorithm can generate the depth from different camera perspectives. In this paper, we propose a novel architecture to infer accurate disparity by leveraging both spectral-consistency based learning model and view-prediction based stereo reconstruction algorithm. Direct spectral-consistency based method can avoid false positive matching in smooth regions. Single image stereo can preserve more distinct boundaries from another camera perspective. By learning confidence map and designing a fusion strategy, the two disparities from two approaches are able to be effectively fused to produce the refined disparity. 
Extensive experiments indicate that our method exploits both advantages of spectral consistency and view prediction, especially in constraining boundaries and correcting wrong predicting regions.", + "title": "Single Image Depth Estimation Based on Spectral Consistency and Predicted View", + "authors": [], + "emails": [], + "rank": 2823 + }, + { + "url": "https://openreview.net/forum?id=Ki5Mv0iY8C", + "decision": "Reject", + "ratings": [ + 3, + 4, + 4, + 4 + ], + "rating": "3.69", + "confidences": [ + 5, + 4, + 3, + 4 + ], + "abstract": "The intuitive connection to robustness and convincing empirical evidence have made the flatness of the loss surface an attractive measure of generalizability for neural networks. \n Yet it suffers from various problems such as computational difficulties, reparametrization issues, and a growing concern that it may only be an epiphenomenon of optimization methods. \n We provide empirical evidence that under the cross-entropy loss once a neural network reaches a non-trivial training error, the flatness correlates (via Pearson Correlation Coefficient) well to the classification margins, which allows us to better reason about the concerns surrounding flatness. \n Our results lead to the practical recommendation that when assessing generalizability one should consider a margin-based measure instead, as it is computationally more efficient, provides further insight, and is highly correlated to flatness. \n We also use our insight to replace the misleading folklore that small-batch methods generalize better because they are able to escape sharp minima. Instead, we argue that large-batch methods did not have enough time to maximize margins and hence generalize worse. 
", + "title": "On Flat Minima, Large Margins and Generalizability", + "authors": [ + "Daniel Lengyel", + "Nicholas Jennings", + "Panos Parpas", + "Nicholas Kantas" + ], + "emails": [ + "imperial.ac.uk", + "~Daniel_Lengyel1", + "~Nicholas_Jennings1", + "~Panos_Parpas1" + ], + "rank": 2824 + }, + { + "url": "https://openreview.net/forum?id=8q_ca26L1fz", + "decision": "Reject", + "ratings": [ + 3, + 4, + 5, + 3 + ], + "rating": "3.69", + "confidences": [ + 5, + 3, + 4, + 4 + ], + "abstract": "Graph neural networks (GNNs) have achieved great success in recent years. Three most common applications include node classification, link prediction, and graph classification. While there is rich literature on node classification and graph classification, GNNs for link prediction is relatively less studied and less understood. Two representative classes of methods exist: GAE and SEAL. GAE (Graph Autoencoder) first uses a GNN to learn node embeddings for all nodes, and then aggregates the embeddings of the source and target nodes as their link representation. SEAL extracts a subgraph around the source and target nodes, labels the nodes in the subgraph, and then uses a GNN to learn a link representation from the labeled subgraph. In this paper, we thoroughly discuss the differences between these two classes of methods, and conclude that simply aggregating \\textit{node} embeddings does not lead to effective \\textit{link} representations, while learning from \\textit{properly labeled subgraphs} around links provides highly expressive and generalizable link representations. 
Experiments on the recent large-scale OGB link prediction datasets show that SEAL has up to 195\\% performance gains over GAE methods, achieving new state-of-the-art results on 3 out of 4 datasets.", + "title": "Revisiting Graph Neural Networks for Link Prediction", + "authors": [ + "Muhan Zhang", + "Pan Li", + "Yinglong Xia", + "Kai Wang", + "Long Jin" + ], + "emails": [ + "fb.com", + "~Muhan_Zhang1", + "~Pan_Li2" + ], + "rank": 2825 + }, + { + "url": "https://openreview.net/forum?id=Gj9aQfQEHRS", + "decision": "Reject", + "ratings": [ + 4, + 3, + 4, + 4 + ], + "rating": "3.69", + "confidences": [ + 4, + 5, + 3, + 4 + ], + "abstract": "The Propositional Satisfiability Problem (SAT), and more generally, the Constraint Satisfaction Problem (CSP), are mathematical questions defined as finding an assignment to a set of objects that satisfies a series of constraints. The modern approach is trending to solve CSP through neural symbolic methods. Most recent works are sequential model-based and adopt neural embedding, i.e., reinforcement learning with neural graph networks, and graph recurrent neural networks. This work proposes a one-shot model derived from the eminent Transformer architecture for factor graph structure to solve the CSP problem. We define the heterogeneous attention mechanism based on meta-paths for the self-attention between literals, the cross-attention based on the bipartite graph links from literal to clauses, or vice versa. This model takes advantage of parallelism. 
Our model achieves high speed and very high accuracy on the factor graph for CSPs with arbitrary size.", + "title": "Transformers satisfy", + "authors": [ + "Feng Shi", + "CHEN LI", + "Shijie Bian", + "Yiqiao Jin", + "Ziheng Xu", + "Tian Han", + "Song-Chun Zhu" + ], + "emails": [ + "~Yiqiao_Jin1", + "~Feng_Shi1", + "~Song-Chun_Zhu1", + "~CHEN_LI14", + "~Ziheng_Xu1", + "~Shijie_Bian1", + "~Tian_Han1" + ], + "rank": 2826 + }, + { + "url": "https://openreview.net/forum?id=tbwjUvUzQRU", + "decision": "Reject", + "ratings": [ + 6, + 1, + 5, + 5 + ], + "rating": "3.69", + "confidences": [ + 3, + 5, + 3, + 2 + ], + "abstract": "A federated kernel k-means algorithm is developed in this paper. This algorithm resolves two challenging issues: 1) how to distributedly solve the optimization problem of kernel k-means under federated settings; 2) how to maintain communication efficiency in the algorithm. To tackle the first challenge, a distributed stochastic proximal gradient descent (DSPGD) algorithm is developed to determine an approximated solution to the optimization problem of kernel k-means. To tackle the second challenge, a communication efficient mechanism (CEM) is designed to reduce the communication cost. Besides, the federated kernel k-means provides two levels of privacy preservation: 1) users\u2019 local data are not exposed to the cloud server; 2) the cloud server cannot recover users\u2019 local data from the local computational results via matrix operations. Theoretical analysis shows: 1) DSPGD with CEM converges with an O(1/T) rate, where T is the number of iterations; 2) the communication cost of DSPGD with CEM is unrelated to the number of data samples; 3) the clustering quality of the federated kernel k-means approaches that of the standard kernel k-means, with a (1+\u03f5) approximate ratio. 
The experimental results show that the federated kernel k-means achieves the highest clustering quality with the communication cost reduced by more than 60% in most cases.", + "title": "A Communication Efficient Federated Kernel k-Means", + "authors": [ + "Xiaochen Zhou", + "Xudong Wang" + ], + "emails": [ + "sjtu.edu.cn", + "~Xiaochen_Zhou2" + ], + "rank": 2827 + }, + { + "url": "https://openreview.net/forum?id=V0IkICKUptb", + "ratings": [ + 4, + 3, + 4, + 4 + ], + "rating": "3.69", + "confidences": [ + 4, + 5, + 3, + 4 + ], + "abstract": "Paper withdrawn.", + "title": "Highway-Connection Classifier Networks for Plastic yet Stable Continual Learning", + "authors": [ + "Nicholas I-Hsien Kuo", + "Mehrtash Harandi", + "Nicolas Fourrier", + "Christian Walder", + "Gabriela Ferraro", + "Hanna Suominen" + ], + "emails": [ + "~Nicolas_Fourrier1", + "~Hanna_Suominen2", + "~Nicholas_I-Hsien_Kuo1", + "~Christian_Walder1", + "~Mehrtash_Harandi2", + "data61.csiro.aut" + ], + "rank": 2828 + }, + { + "url": "https://openreview.net/forum?id=uv_x9uAH1MY", + "ratings": [ + 2, + 6, + 5, + 3 + ], + "rating": "3.67", + "confidences": [ + 5, + 3, + 3, + 4 + ], + "abstract": "The development of graph neural networks (GNNs) stimulated the interest in GNNs for NP-hard problems, while most works apply GNNs for NP-hard problems by empirical intuition and experimental trials and improve the results with the help of some heuristic designs. We start from a simple contradiction that a graph coloring problem requires two connected and symmetric nodes to be assigned different colors while an expressively powerful GNN always maps the two nodes to the same node embeddings. To characterize the power of GNNs for the graph coloring problem, we first formalize the discrimination power of GNNs as the capability to assign nodes different colors. 
We then study a popular class of GNNs, called AC-GNN, and identify the node pairs that AC-GNNs fail to discriminate and provide corresponding solutions to discriminate them. We show that any AC-GNN is a local coloring method and any local coloring method is non-optimal. Moreover, we discuss the color equivalence of graphs and give a condition for the color equivalent AC-GNN theoretically. Following the approaches which prove to enhance the discriminative power, we develop a simple architecture for the graph coloring problem without heuristic pre-processing and post-processing procedures. We empirically validate our theoretical findings and demonstrate that our model is discriminatively powerful and even comparable with state-of-the-art heuristic algorithms.", + "title": "Rethinking Graph Neural Networks for Graph Coloring", + "authors": [ + "Wei Li", + "Ruxuan Li", + "Yuzhe Ma", + "Siu On Chan", + "Bei Yu" + ], + "emails": [ + "~Wei_Li35", + "~Bei_Yu2", + "link.cuhk.edu.hk", + "~Yuzhe_Ma2", + "~Siu_On_Chan1" + ], + "rank": 2829 + }, + { + "url": "https://openreview.net/forum?id=oXQxan1BWgU", + "decision": "Reject", + "ratings": [ + 3, + 4, + 5, + 3 + ], + "rating": "3.67", + "confidences": [ + 5, + 4, + 3, + 3 + ], + "abstract": "In meta-learning, the knowledge learned from previous tasks is transferred to new ones, but this transfer only works if tasks are related, and sharing information between unrelated tasks might hurt performance. A fruitful approach is to share gradients across similar tasks during training, and recent work suggests that the gradients themselves can be used as a measure of task similarity.\nWe study the case in which datasets associated to different tasks have a hierarchical, tree structure. While a few methods have been proposed for hierarchical meta-learning in the past, we propose the first algorithm that is model-agnostic, a simple extension of MAML. 
As in MAML, our algorithm adapts the model to each task with a few gradient steps, but the adaptation follows the tree structure: in each step, gradients are pooled across task clusters, and subsequent steps follow down the tree. We test the algorithm on linear and non-linear regression on synthetic data, and show that the algorithm significantly improves over MAML. Interestingly, the algorithm performs best when it does not know in advance the tree structure of the data.", + "title": "Model agnostic meta-learning on trees", + "authors": [ + "Jezabel Garcia", + "Federica Freddi", + "Jamie McGowan", + "Tim Nieradzik", + "Da-shan Shiu", + "Ye Tian", + "Alberto Bernacchia" + ], + "emails": [ + "nieradzik.me", + "ucl.ac.uk", + "mtkresearch.com", + "gmail.com", + "~Alberto_Bernacchia1" + ], + "rank": 2830 + }, + { + "url": "https://openreview.net/forum?id=5aDnCA_RXS", + "ratings": [ + 5, + 4, + 2, + 4 + ], + "rating": "3.67", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "Determining what kind of representations neural networks learn, and how this may relate to generalization, remains a challenging problem. Previous work has utilized a rich set of methods to invert layer representations of neural networks, i.e. given some reference activation \u03a60 and a layer function r\u2113, find x which minimizes ||\u03a60\u2212r\u2113(x)||2 . We show that neural networks can preserve invertibility across several iterations. That is, it is possible to interpret activations produced in some later iteration in the context of the layer function of the current iteration. For convolutional and fully connected networks, the lower layers maintain such a consistent representation for several iterations, while in the higher layers invertibility holds for fewer iterations. Adding skip connections such as those found in Resnet allows even higher layers to preserve invertibility across several iterations. 
We believe the fact that higher layers may interpret weight changes made by lower layers as changes to the data may produce implicit data augmentation. This implicit data augmentation may eventually yield some insight into why neural networks can generalize even with so many parameters.", + "title": "Neural Networks Preserve Invertibility Across Iterations: A Possible Source of Implicit Data Augmentation", + "authors": [ + "Arushi Gupta" + ], + "emails": [ + "~Arushi_Gupta1" + ], + "rank": 2831 + }, + { + "url": "https://openreview.net/forum?id=0p-aRvcVs-U", + "decision": "Reject", + "ratings": [ + 4, + 4, + 3 + ], + "rating": "3.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Multitask Learning is a Machine Learning paradigm that aims to train a range of (usually related) tasks with the help of a shared model. While the goal is often to improve the joint performance of all training tasks, another approach is to focus on the performance of a specific target task, while treating the remaining ones as auxiliary data from which to possibly leverage positive transfer towards the target during training. In such settings, it becomes important to estimate the positive or negative influence auxiliary tasks will have on the target. While many ways have been proposed to estimate task weights before or during training they typically rely on heuristics or extensive search of the weighting space. We propose a novel method called \u03b1-Variable Importance Learning (\u03b1VIL) that is able to adjust task weights dynamically during model training, by making direct use of task-specific updates of the underlying model's parameters between training epochs. Experiments indicate that \u03b1VIL is able to outperform other Multitask Learning approaches in a variety of settings. 
To our knowledge, this is the first attempt at making direct use of model updates for task weight estimation.", + "title": "\u03b1VIL: Learning to Leverage Auxiliary Tasks for Multitask Learning", + "authors": [ + "Rafael Kourdis", + "Gabriel Gordon-Hall", + "Philip John Gorinski" + ], + "emails": [ + "~Philip_John_Gorinski1", + "gmail.com" + ], + "rank": 2832 + }, + { + "url": "https://openreview.net/forum?id=8m_XkdqjZAr", + "ratings": [ + 3, + 4, + 4 + ], + "rating": "3.67", + "confidences": [ + 4, + 3, + 5 + ], + "abstract": "Real-world binary classification tasks are in many cases unbalanced i.e. the minority class is much smaller than the majority class. This skewness is challenging for machine learning algorithms as they tend to focus on the majority and greatly misclassify the minority. Oversampling the minority using \\emph{SMOTE} before training the model is a popular method to address this challenge. Inspired by \\emph{SMOTE}, we propose \\emph{AE-SMOTE}, which by using an autoencoder, (1) maps the features to a dense continuous latent space, (2) applies oversampling by interpolation in the latent space, and (3) maps the synthetic samples back to the original feature space. While \\emph{SMOTE} supports discrete (categorical) features, almost all variants and extensions of \\emph{SMOTE} do not. Wrapping any one of these \\emph{SMOTE} variants with an autoencoder will enable it to support multi-modal datasets that include discrete features. 
We have empirically shown the effectiveness of the proposed approach on 35 publicly available datasets.", + "title": "AE-SMOTE: A Multi-Modal Minority Oversampling Framework", + "authors": [ + "Sajad Darabi", + "Yotam Elor" + ], + "emails": [ + "gmail.com", + "~Sajad_Darabi1" + ], + "rank": 2833 + }, + { + "url": "https://openreview.net/forum?id=DGttsPh502x", + "decision": "Reject", + "ratings": [ + 4, + 5, + 3, + 3 + ], + "rating": "3.67", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Language generation models are attracting more and more attention due to their constantly increasing quality and remarkable generation results. State-of-the-art NLG models like BART/T5/GPT-3 do not have latent spaces, therefore there is no natural way to perform controlled generation. In contrast, less popular models with explicit latent spaces have the innate ability to manipulate text attributes by moving along latent directions. For images, properties of latent spaces are well-studied: there exist interpretable directions (e.g. zooming, aging, background removal) and they can even be found without supervision. This success is expected: latent space image models, especially GANs, achieve state-of-the-art generation results and hence have been the focus of the research community. For language, this is not the case: text GANs are hard to train because of non-differentiable discrete data generation, and language VAEs suffer from posterior collapse and fill the latent space poorly. This makes finding interpetable text controls challenging. In this work, we make the first step towards unsupervised discovery of interpretable directions in language latent spaces. For this, we turn to methods shown to work in the image domain. Surprisingly, we find that running PCA on VAE representations of training data consistently outperforms shifts along the coordinate and random directions. 
This approach is simple, data-adaptive, does not require training and discovers meaningful directions, e.g. sentence length, subject age, and verb tense. Our work lays foundations for two important areas: first, it allows to compare models in terms of latent space interpretability, and second, it provides a baseline for unsupervised latent controls discovery.", + "title": "Unsupervised Discovery of Interpretable Latent Manipulations in Language VAEs", + "authors": [ + "Max Ryabinin", + "Artem Babenko", + "Elena Voita" + ], + "emails": [ + "~Artem_Babenko1", + "~Max_Ryabinin1", + "~Elena_Voita1" + ], + "rank": 2834 + }, + { + "url": "https://openreview.net/forum?id=PcBVjfeLODY", + "decision": "Reject", + "ratings": [ + 5, + 3, + 4, + 3 + ], + "rating": "3.67", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "The representation of products in a e-commerce marketplace is a key aspect to be exploited when trying to improve the user experience on the site. A well known example of the importance of a good product representation are tasks such as product search or product recommendation. There is however a multitude of lesser known tasks relevant to the business, examples are the detection of counterfeit items, the estimation of package sizes or the categorization of products, among others. It is in this setting that good vector representations of products that can be reused on different tasks are very valuable. Past years have seen a major increase in research in the area of latent representations for products in e-Commerce. Examples of this are models like Prod2Vec or Meta-Prod2Vec which leverage from the information of a user session in order to generate vectors of the products that can be used in product recommendations. This work proposes a novel deep encoder model for learning product embeddings to be applied in several downstream tasks. 
The model uses pairs of products that appear together in a browsing session of the users and adds a proximity constraint to the final latent space in order to project the embeddings of similar products close to each other. This has a regularization effect which gives better features representations to use across multiple downstream tasks, we explore such effect in our experimentation by assessing its impact on the performance of the tasks. Our experiments show effectiveness in transfer learning scenarios comparable to several industrial baselines.", + "title": "Constraining Latent Space to Improve Deep Self-Supervised e-Commerce Products Embeddings for Downstream Tasks", + "authors": [ + "Cristian Cardellino", + "Rafael Carrascosa" + ], + "emails": [ + "mercadolibre.com", + "~Cristian_Cardellino1" + ], + "rank": 2835 + }, + { + "url": "https://openreview.net/forum?id=oweBPxtma_i", + "decision": "Reject", + "ratings": [ + 5, + 3, + 3 + ], + "rating": "3.67", + "confidences": [ + 3, + 2, + 4 + ], + "abstract": "Recently, for finding inherent causality implied in CNN, the black box problem of its discrimination part, which is composed of all fully connected layers of the CNN, has been studied by different scientific communities. Many methods were proposed, which can extract various interpretable models from the optimal discrimination part based on inputs and outputs of the part for finding the inherent causality implied in the part. However, the inherent causality cannot readily be found. We think that the problem could be solved by shrinking an interpretable distance which can evaluate the degree for the discrimination part to be easily explained by an interpretable model. This paper proposes a lightweight interpretable model, Deep Cognitive Learning Model(DCLM). And then, a game method between the DCLM and the discrimination part is implemented for shrinking the interpretation distance. 
Finally, the proposed self-explanatory method was evaluated by some contrastive experiments with certain baseline methods on some standard image processing benchmarks. These experiments indicate that the proposed method can effectively find the inherent causality implied in the discrimination part of the CNN without largely reducing its generalization performance. Moreover, the generalization performance of the DCLM also can be improved.", + "title": "A self-explanatory method for the black box problem on discrimination part of CNN", + "authors": [ + "Jinwei Zhao", + "Qizhou Wang", + "Wanli Qiu", + "Guo Xie", + "Wei Wang", + "Xinhong Hei", + "Deyu Meng" + ], + "emails": [ + "xaut.edu.cn", + "qq.com", + "~Deyu_Meng1", + "foxmail.com", + "~Jinwei_Zhao1" + ], + "rank": 2836 + }, + { + "url": "https://openreview.net/forum?id=OCRKCul3eKN", + "decision": "Reject", + "ratings": [ + 4, + 4, + 3 + ], + "rating": "3.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Reinforcement learning (RL) encompasses both online and offline regimes. Unlike its online counterpart, offline RL agents are trained using logged-data only, without interaction with the environment. Therefore, offline RL is a promising direction for real-world applications, such as healthcare, where repeated interaction with environments is prohibitive. However, since offline RL losses often involve evaluating state-action pairs not well-covered by training data, they can suffer due to the errors introduced when the function approximator attempts to extrapolate those pairs' value. These errors can be compounded by bootstrapping when the function approximator overestimates, leading the value function to *grow unbounded*, thereby crippling learning. In this paper, we introduce a three-part solution to combat extrapolation errors: (i) behavior value estimation, (ii) ranking regularization, and (iii) reparametrization of the value function. 
We provide ample empirical evidence on the effectiveness of our method, showing state of the art performance on the RL Unplugged (RLU) ATARI dataset. Furthermore, we introduce new datasets for bsuite as well as partially observable DeepMind Lab environments, on which our method outperforms state of the art offline RL algorithms. \n", + "title": "Addressing Extrapolation Error in Deep Offline Reinforcement Learning", + "authors": [ + "Caglar Gulcehre", + "Sergio G\u00f3mez Colmenarejo", + "ziyu wang", + "Jakub Sygnowski", + "Thomas Paine", + "Konrad Zolna", + "Yutian Chen", + "Matthew Hoffman", + "Razvan Pascanu", + "Nando de Freitas" + ], + "emails": [ + "~Yutian_Chen1", + "~Konrad_Zolna1", + "~Thomas_Paine1", + "~Matthew_Hoffman1", + "~Sergio_G\u00f3mez_Colmenarejo1", + "~Jakub_Sygnowski1", + "~Razvan_Pascanu1", + "~Nando_de_Freitas1", + "~Caglar_Gulcehre1", + "~ziyu_wang1" + ], + "rank": 2837 + }, + { + "url": "https://openreview.net/forum?id=UxEmUjQujER", + "ratings": [ + 5, + 3, + 4, + 3 + ], + "rating": "3.67", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "As the success of Generative Adversarial Networks (GANs) on natural images quickly propels them into various real-life applications across different domains, it becomes more and more important to clearly understand their limitations. Specifically, understanding GANs' capability across the full spectrum of spatial frequencies, i.e. beyond the low-frequency dominant spectrum of natural images, is critical for assessing the reliability of GAN generated data in any detail-sensitive application (e.g. denoising, filling and super-resolution in medical and satellite images). In this paper, we show that the ability of convolutional GANs to learn a distribution is significantly affected by the spatial frequency of the underlying carrier signal, that is, GANs have a bias against learning high spatial frequencies. 
Crucially, we show that this bias is not merely a result of the scarcity of high frequencies in natural images, rather, it is a systemic bias hindering the learning of high frequencies regardless of their prominence in a dataset. Furthermore, we explain why large-scale GANs' ability to generate fine details on natural images does not exclude them from the adverse effects of this bias. Finally, we propose a method for manipulating this bias with minimal computational overhead. This method can be used to explicitly direct computational resources towards any specific spatial frequency of interest in a dataset, extending the flexibility of GANs.", + "title": "Spatial Frequency Bias in Convolutional Generative Adversarial Networks", + "authors": [ + "Mahyar Khayatkhoei", + "Ahmed Elgammal" + ], + "emails": [ + "~Mahyar_Khayatkhoei1", + "~Ahmed_Elgammal1" + ], + "rank": 2838 + }, + { + "url": "https://openreview.net/forum?id=KfRtxjqU-Hd", + "ratings": [ + 4, + 3, + 4 + ], + "rating": "3.67", + "confidences": [ + 3, + 4, + 5 + ], + "abstract": "While there exists a wide variety of graph neural networks (GNN) for node classification, only a minority of them adopt effective mechanisms to propagate the nodes' information with respect to these nodes' global importance. Additionally, two very important challenges that still significantly affect graph neural networks are the over-fitting and over-smoothing issues. Essentially, both issues cause poor generalization of the model and much poorer node classification performance. In this paper we propose the NODE-SELECT graph neural network (NSGNN): a novel and flexible graph neural network that uses subsetting filters to learn the contribution from the nodes selected to share their information. For the selected nodes, the way their learned information propagates resembles that of actual networks of the real world; where only a subset of nodes simultaneously share information. 
With the ability to manipulate the message passing operations through the use of numerous ensembled filters, our NODE-SELECT graph neural network is able to address the over-fitting problem and by-pass the over-smoothing challenge for graph neural networks. Furthermore, we also propose an efficient and informative measure named MICS to quantify the over-smoothing problem. Our NODE-SELECT achieved or matched state-of-the art results in a number of transductive experiments over different benchmark datasets.", + "title": "NODE-SELECT: A FLEXIBLE GRAPH NEURAL NETWORK BASED ON REALISTIC PROPAGATION SCHEME", + "authors": [ + "Steph-Yves Louis", + "Alireza Nasiri", + "Fatima Christina Rolland", + "Cameron Mitro", + "Jianjun Hu" + ], + "emails": [ + "~Alireza_Nasiri1", + "~Steph-Yves_Louis1", + "drexel.edu", + "~Jianjun_Hu1", + "atriumhealth.org" + ], + "rank": 2839 + }, + { + "url": "https://openreview.net/forum?id=3l4Dlrgm92Q", + "ratings": [ + 3, + 3, + 5 + ], + "rating": "3.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Backdoor attack against deep neural networks is currently being profoundly investigated due to its severe security consequences. Current state-of-the-art backdoor attacks require the adversary to modify the input, usually by adding a trigger to it, for the target model to activate the backdoor. This added trigger not only increases the difficulty of launching the backdoor attack in the physical world, but also can be easily detected by multiple defense mechanisms. In this paper, we present the first triggerless backdoor attack against deep neural networks, where the adversary does not need to modify the input for triggering the backdoor. Our attack is based on the dropout technique. Concretely, we associate a set of target neurons that are dropped out during model training with the target label. In the prediction phase, the model will output the target label when the target neurons are dropped again, i.e., the backdoor attack is launched. 
This triggerless feature of our attack makes it practical in the physical world. Extensive experiments show that our triggerless backdoor attack achieves a perfect attack success rate with a negligible damage to the model's utility.", + "title": "Don't Trigger Me! A Triggerless Backdoor Attack Against Deep Neural Networks", + "authors": [ + "Ahmed Salem", + "Michael Backes", + "Yang Zhang" + ], + "emails": [ + "~Ahmed_Salem2", + "~Yang_Zhang15", + "~Michael_Backes1" + ], + "rank": 2840 + }, + { + "url": "https://openreview.net/forum?id=IrofNLZuWF", + "decision": "Reject", + "ratings": [ + 3, + 4, + 5, + 3 + ], + "rating": "3.67", + "confidences": [ + 5, + 4, + 4, + 5 + ], + "abstract": "We investigate stochastic optimization under weaker assumptions on the distribution of noise than those used in usual analysis. Our assumptions are motivated by empirical observations in training neural networks. In particular, standard results on optimal convergence rates for stochastic optimization assume either there exists a uniform bound on the moments of the gradient noise, or that the noise decays as the algorithm progresses. These assumptions do not match the empirical behavior of optimization algorithms used in neural network training where the noise level in stochastic gradients could even increase with time. We address this nonstationary behavior of noise by analyzing convergence rates of stochastic gradient methods subject to changing second moment (or variance) of the stochastic oracle. When the noise variation is known, we show that it is always beneficial to adapt the step-size and exploit the noise variability. When the noise statistics are unknown, we obtain similar improvements by developing an online estimator of the noise level, thereby recovering close variants of RMSProp~\\citep{tieleman2012lecture}. 
Consequently, our results reveal why adaptive step size methods can outperform SGD, while still enjoying theoretical guarantees.", + "title": "Stochastic Optimization with Non-stationary Noise: The Power of Moment Estimation", + "authors": [ + "Jingzhao Zhang", + "Hongzhou Lin", + "Subhro Das", + "Suvrit Sra", + "Ali Jadbabaie" + ], + "emails": [ + "~Suvrit_Sra1", + "~Ali_Jadbabaie1", + "~Jingzhao_Zhang2", + "~Subhro_Das1", + "~Hongzhou_Lin1" + ], + "rank": 2841 + }, + { + "url": "https://openreview.net/forum?id=rAIkhjUK0Tx", + "ratings": [ + 5, + 4, + 4, + 2 + ], + "rating": "3.67", + "confidences": [ + 3, + 4, + 4, + 4 + ], + "abstract": "This paper presents a novel multi-step reinforcement learning algorithms, named Greedy Multi-Step Value Iteration (GM-VI), under off-policy setting. GM-VI iteratively approximates the optimal value function of a given environment using a newly proposed multi-step bootstrapping technique, in which the step size is adaptively adjusted along each trajectory according to a greedy principle. With the improved multi-step information propagation mechanism, we show that the resulted VI process is capable of safely learning from arbitrary behavior policy without additional off-policy correction. We further analyze the theoretical properties of the corresponding operator, showing that it is able to converge to globally optimal value function, with a rate faster than traditional Bellman Optimality Operator. 
Experiments reveal that the proposed methods is reliable, easy to implement and achieves state-of-the-art performance on a series of standard benchmark datasets.", + "title": "Greedy Multi-Step Off-Policy Reinforcement Learning", + "authors": [ + "Yuhui Wang", + "Xiaoyang Tan" + ], + "emails": [ + "~Xiaoyang_Tan2", + "~Yuhui_Wang1" + ], + "rank": 2842 + }, + { + "url": "https://openreview.net/forum?id=pXi-zY262sE", + "decision": "Reject", + "ratings": [ + 4, + 4, + 3 + ], + "rating": "3.67", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "We introduce a training method for better word representation and performance, which we call \\textbf{GraVeR} (\\textbf{Gra}dual \\textbf{Ve}ctor \\textbf{R}umination). The method is to gradually and iteratively add random noises and bias to word embeddings after training a model, and re-train the model from scratch but initialize with the noised word embeddings. Through the re-training process, some noises can be compensated and other noises can be utilized to learn better representations. As a result, we can get word representations further fine-tuned and specialized in the task. On six text classification tasks, our method improves model performances with a large gap. When GraVeR is combined with other regularization techniques, it shows further improvements. 
Lastly, we investigate the usefulness of GraVeR.", + "title": "Ruminating Word Representations with Random Noise Masking", + "authors": [ + "Hwiyeol Jo", + "Byoung-Tak Zhang" + ], + "emails": [ + "~Byoung-Tak_Zhang1", + "~Hwiyeol_Jo1" + ], + "rank": 2843 + }, + { + "url": "https://openreview.net/forum?id=cef_G2hkiGc", + "decision": "Reject", + "ratings": [ + 3, + 4, + 2, + 6, + 4 + ], + "rating": "3.67", + "confidences": [ + 4, + 4, + 4, + 3, + 3 + ], + "abstract": "Pruning of neural networks, also known as compression or sparsification, is the task of converting a given network, which may be too expensive to use (in prediction) on low resource platforms, with another 'lean' network which performs almost as well as the original one, while using considerably fewer resources. By turning the compression ratio knob, the practitioner can trade off the information gain versus the necessary computational resources, where information gain is a measure of reduction of uncertainty in the prediction.\n\nIn certain cases, however, the practitioner may readily possess some information on the prediction from other sources. The main question we study here is, whether it is possible to take advantage of the additional side information, in order to further reduce the computational resources, in tandem with the pruning process?\n\nMotivated by a real-world application, we distill the following elegantly stated problem. We are given a multi-class prediction problem, combined with a (possibly pre-trained) network architecture for solving it on a given instance distribution, and also a method for pruning the network to allow trading off prediction speed with accuracy. We assume the network and the pruning methods are state-of-the-art, and it is not our goal here to improve them. 
However, instead of being asked to predict a single drawn instance x, we are being asked to predict the label of an n-tuple of instances (x1,\u2026xn), with the additional side information of all tuple instances share the same label. The shared label distribution is identical to the distribution on which the network was trained.\n\nOne trivial way to do this is by obtaining individual raw predictions for each of the n instances (separately), using our given network, pruned for a desired accuracy, then taking the average to obtain a single more accurate prediction. This is simple to implement but intuitively sub-optimal, because the n independent instantiations of the network do not share any information, and would probably waste resources on overlapping computation.\n\nWe propose various methods for performing this task, and compare them using extensive experiments on public benchmark data sets for image classification. Our comparison is based on measures of relative information (RI) and n-accuracy, which we define. Interestingly, we empirically find that I) sharing information between the n independently computed hidden representations of x1,..,xn, using an LSTM based gadget, performs best, among all methods we experiment with, ii) for all methods studied, we exhibit a sweet spot phenomenon, which sheds light on the compression-information trade-off and may assist a practitioner to choose the desired compression ratio.", + "title": "More Side Information, Better Pruning: Shared-Label Classification as a Case Study", + "authors": [ + "Omer Leibovitch", + "Nir Ailon" + ], + "emails": [ + "~Omer_Leibovitch1", + "~Nir_Ailon1" + ], + "rank": 2844 + }, + { + "url": "https://openreview.net/forum?id=OEgDatKuz2O", + "decision": "Reject", + "ratings": [ + 4, + 3, + 5, + 3 + ], + "rating": "3.65", + "confidences": [ + 3, + 5, + 4, + 5 + ], + "abstract": "We propose an unsupervised domain adaptation approach based on generative models. 
We show that when the source probability density function can be learned, one-step Expectation\u2013Maximization iteration plus an additional marginal density function constraint will produce a proper mediator probability density function to bridge the gap between the source and target domains. The breakthrough is based on modern generative models (autoregressive mixture density nets) that are competitive to discriminative models on moderate-dimensional classification problems. By decoupling the source density estimation from the adaption steps, we can design a domain adaptation approach where the source data is locked away after being processed only once, opening the door to transfer when data security or privacy concerns impede the use of traditional domain adaptation. We demonstrate that our approach can achieve state-of-the-art performance on synthetic and real data sets, without accessing the source data at the adaptation phase.", + "title": "EMTL: A Generative Domain Adaptation Approach", + "authors": [ + "Jianfeng Zhang", + "Illyyne Saffar", + "Aladin Virmaux", + "Bal\u00e1zs K\u00e9gl" + ], + "emails": [ + "~Jianfeng_Zhang2", + "~Illyyne_Saffar1", + "~Bal\u00e1zs_K\u00e9gl2", + "~Aladin_Virmaux1" + ], + "rank": 2845 + }, + { + "url": "https://openreview.net/forum?id=3NemFmEq9jA", + "ratings": [ + 5, + 4, + 3, + 3 + ], + "rating": "3.64", + "confidences": [ + 3, + 3, + 4, + 4 + ], + "abstract": "We introduce two temporal attention modules which can be plugged into traditional memory augmented recurrent neural networks to improve their performance in natural language processing tasks.\nThe temporal attention modules provide new inductive biases allowing the models to compute attention distributions over the different time steps of input sequences.\nThe values of these attention distributions can be inspected to identify the sequence's elements that the model considered relevant during the inference.\nUsing the Entity Network (Henaff et al., 2016) as the model 
backbone, experiments were made on the dataset bAbI tasks, a set of QA tasks.\nDue to the addition of the temporal attention modules, the performance metric increased 26% when the temporal attention was supervised, and 13,5% when it wasn't.\nMoreover, the usage of temporal attention modules proved useful at resolving reasoning tasks that the original model was unable to solve.", + "title": "Temporal Attention Modules for Memory-Augmented Neural Networks", + "authors": [ + "Rodolfo Palma", + "Alvaro Soto", + "Luis Mart\u00ed", + "Nayat Sanchez-pi" + ], + "emails": [ + "~Alvaro_Soto1", + "~Nayat_Sanchez-pi1", + "~Rodolfo_Palma1", + "~Luis_Mart\u00ed1" + ], + "rank": 2846 + }, + { + "url": "https://openreview.net/forum?id=iG_Cg6ONjX", + "decision": "Reject", + "ratings": [ + 4, + 4, + 4, + 3 + ], + "rating": "3.64", + "confidences": [ + 3, + 2, + 2, + 4 + ], + "abstract": "The expressiveness of deep neural network (DNN) is a perspective to understand the surprising performance of DNN. The number of linear regions, i.e. pieces that a piece-wise-linear function represented by a DNN, is generally used to measure the expressiveness. And the upper bound of regions number partitioned by a rectifier network, instead of the number itself, is a more practical measurement of expressiveness of a rectifier DNN. In this work, we propose a new and tighter upper bound of regions number. Inspired by the proof of this upper bound and the framework of matrix computation in \\citet{hinz2019framework}, we propose a general computational approach to compute a tight upper bound of regions number for theoretically any network structures (e.g. DNN with all kind of skip connections and residual structures). 
Our experiments show our upper bound is tighter than existing ones, and explain why skip connections and residual structures can improve network performance.", + "title": "A General Computational Framework to Measure the Expressiveness of Complex Networks using a Tight Upper Bound of Linear Regions", + "authors": [ + "Yutong Xie", + "Gaoxiang Chen", + "Quanzheng Li" + ], + "emails": [ + "~Quanzheng_Li2", + "~Yutong_Xie1", + "pku.edu.cn" + ], + "rank": 2847 + }, + { + "url": "https://openreview.net/forum?id=8-sxWOto_iI", + "decision": "Reject", + "ratings": [ + 5, + 4, + 2, + 4 + ], + "rating": "3.64", + "confidences": [ + 3, + 4, + 4, + 3 + ], + "abstract": "Choosing the right data and model for a pre-defined task is one of the critical competencies in machine learning. Investigating what features of a dataset and its underlying distribution a model decodes may enlighten the mysterious \"black box\" and guide us to a deeper and more profound understanding of the ongoing processes. Furthermore, it will help to improve the quality of models which directly depend on data or learn from it through training. In this work, we introduce the dataset-dependent concept of sample robustness, which is based on a point-wise Lipschitz constant of the label map. For a particular sample, it measures how small of a perturbation is required to cause a label-change relative to the magnitude of the label map. We introduce theory to motivate the concept and to analyse the effects of having similar robustness distributions for the training- and test data. Afterwards, we conduct various experiments using different datasets and (non-)deterministic models. 
In some cases, we can boost performance by choosing specifically tailored training(sub)sets and hyperparameters depending on the robustness distribution of the test(sub)sets.", + "title": "Introducing Sample Robustness", + "authors": [ + "Monty Maximilian Z\u00fchlke" + ], + "emails": [ + "~Monty_Maximilian_Z\u00fchlke1" + ], + "rank": 2848 + }, + { + "url": "https://openreview.net/forum?id=b4Phn_aTm_e", + "decision": "Reject", + "ratings": [ + 3, + 4, + 4 + ], + "rating": "3.64", + "confidences": [ + 5, + 4, + 5 + ], + "abstract": "Multi-task learning (MTL) for scene understanding has been actively studied by exploiting correlation of multiple tasks. This work focuses on improving the performance of the MTL network that infers depth and semantic segmentation maps from a single image. Specifically, we propose a novel MTL architecture, called Pseudo-MTL, that introduces pseudo labels for joint learning of monocular depth estimation and semantic segmentation tasks. The pseudo ground truth depth maps, generated from pretrained stereo matching methods, are leveraged to supervise the monocular depth estimation. More importantly, the pseudo depth labels serve to impose a cross-view consistency on the estimated monocular depth and segmentation maps of two views. This enables for mitigating the mismatch problem incurred by inconsistent prediction results across two views. 
A thorough ablation study validates that the cross-view consistency leads to a substantial performance gain by ensuring inference-view invariance for the two tasks.", + "title": "Pseudo Label-Guided Multi Task Learning for Scene Understanding", + "authors": [ + "Sunkyung Kim", + "Hyesong Choi", + "Dongbo Min" + ], + "emails": [ + "~Dongbo_Min3", + "~Sunkyung_Kim1", + "~Hyesong_Choi1" + ], + "rank": 2849 + }, + { + "url": "https://openreview.net/forum?id=PcUprce4TM2", + "decision": "Reject", + "ratings": [ + 4, + 3, + 4, + 4 + ], + "rating": "3.64", + "confidences": [ + 2, + 5, + 3, + 4 + ], + "abstract": "Private training data can be leaked through the gradient sharing mechanism deployed in machine learning systems, such as federated learning (FL).\nIncreasing batch size is often viewed as a promising defense strategy against data leakage. In this paper, we revisit this defense premise and propose an advanced data leakage attack to efficiently recover batch data from the shared aggregated gradients. \nWe name our proposed method as \\textit{\\underline{c}atastrophic d\\underline{a}ta leakage in \\underline{f}ederated l\\underline{e}arning (CAFE)}.\nComparing to existing data leakage attacks, CAFE demonstrates the ability to perform large-batch data leakage attack with high data recovery quality. \nExperimental results on vertical and horizontal FL settings have validated the effectiveness of CAFE in recovering private data from the shared aggregated gradients. \nOur results suggest that data participated in FL, especially the vertical case, have a high risk of being leaked from the training gradients. 
Our analysis implies unprecedented and practical data leakage risks in those learning settings.", + "title": "CAFE: Catastrophic Data Leakage in Federated Learning", + "authors": [ + "Xiao Jin", + "Ruijie Du", + "Pin-Yu Chen", + "Tianyi Chen" + ], + "emails": [ + "~Pin-Yu_Chen1", + "gmail.com", + "~Tianyi_Chen1" + ], + "rank": 2850 + }, + { + "url": "https://openreview.net/forum?id=3X4JzHq5fU5", + "ratings": [ + 4, + 4, + 3 + ], + "rating": "3.64", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "The remarkable performance of modern deep learning methods depends critically on the optimization of their hyperparameters. One major challenge is that evaluating a single hyperparameter configuration on large datasets could nowadays easily exceed hours or days. For efficient sampling and fast evaluation, some previous works presented effective computing resource allocation schemes and built a Bayesian surrogate model to sample candidate hyperparameters. However, the model itself is not related to budgets which are set manually. To deal with this problem, a new Gaussian Process model involved in budgets is proposed. Further, for this model, an optimal design is constructed by the equivalence theorem to replace random search as an initial sampling strategy in the search space. Experiments demonstrate that the new model has the best performance among competing methods. 
Moreover, comparisons between different initial designs with the same model show the advantage of the proposed optimal design.", + "title": "Optimal Designs of Gaussian Processes with Budgets for Hyperparameter Optimization", + "authors": [ + "Yimin Huang", + "Yujun Li", + "Zhenguo Li", + "Zhihua Zhang" + ], + "emails": [ + "~Zhihua_Zhang1", + "~Yujun_Li1", + "~Yimin_Huang2", + "~Zhenguo_Li1" + ], + "rank": 2851 + }, + { + "url": "https://openreview.net/forum?id=QkQCcFsUtk", + "ratings": [ + 3, + 4, + 4 + ], + "rating": "3.64", + "confidences": [ + 5, + 4, + 5 + ], + "abstract": "Cross-lingual alignment of word embeddings play an important role in knowledge transfer across languages, for improving machine translation and other multi-lingual applications. Current unsupervised approaches rely on similarities in geometric structure of word embedding spaces across languages, to learn structure-preserving linear transformations using adversarial networks and refinement strategies. However, such techniques, in practice, tend to suffer from instability and convergence issues, requiring tedious fine-tuning for precise parameter setting. This paper proposes BioSpere, a novel framework for unsupervised mapping of bi-lingual word embeddings onto a shared vector space, by combining adversarial initialization and refinement procedure with point set registration algorithm used in image processing. We show that our framework alleviates the shortcomings of existing methodologies, and is relatively invariant to variable adversarial learning performance, depicting robustness in terms of parameter choices and training losses. 
Experimental evaluation on parallel dictionary induction task demonstrates state-of-the-art results for our framework on diverse language pairs.", + "title": "Unsupervised Word Translation Pairing using Refinement based Point Set Registration", + "authors": [ + "Silviu Oprea", + "Sourav Dutta", + "Haytham Assem" + ], + "emails": [ + "~Sourav_Dutta1", + "huawei.com", + "ed.ac.uk" + ], + "rank": 2852 + }, + { + "url": "https://openreview.net/forum?id=n76UrS2Gs3M", + "ratings": [ + 5, + 3, + 4, + 3 + ], + "rating": "3.62", + "confidences": [ + 3, + 4, + 4, + 5 + ], + "abstract": "Prediction bias in machine learning models, referring to undesirable model behaviors that discriminates inputs mentioning or produced by certain group, has drawn increasing attention from the research community given its societal impact. While a number of bias mitigation algorithms exist, it is often difficult and/or costly to apply them to a large number of downstream models due to the challenges on (sensitive) user data collection, expensive data annotation, and complications in algorithm implementation. In this paper, we present a new approach for creating less biased downstream models: transfer learning from a less biased upstream model. A model is trained with bias mitigation algorithms in the source domain and fine-tuned in the target domain without bias mitigation. By doing so, the framework allows to achieve less bias on downstream tasks in a more efficient, accessible manner. We conduct extensive experiments with the proposed framework under different levels of similarities between the source and target domain and the number of factors included for de-biasing. 
The results are positive, implying that less biased models can be obtained with our transfer learning framework.", + "title": "Efficient Learning of Less Biased Models with Transfer Learning", + "authors": [ + "Xisen Jin", + "Francesco Barbieri", + "Leonardo Neves", + "Xiang Ren" + ], + "emails": [ + "~Xiang_Ren1", + "~Leonardo_Neves1", + "~Xisen_Jin3", + "~Francesco_Barbieri1" + ], + "rank": 2853 + }, + { + "url": "https://openreview.net/forum?id=b4ach0lGuYO", + "decision": "Reject", + "ratings": [ + 5, + 6, + 2, + 4 + ], + "rating": "3.62", + "confidences": [ + 3, + 1, + 5, + 4 + ], + "abstract": "Autoencoders have emerged as popular methods for unsupervised anomaly detection. Autoencoders trained on the normal data are expected to reconstruct only the normal features, allowing anomaly detection by thresholding reconstruction errors. However, in practice, autoencoders fail to model small detail and yield blurry reconstructions, which makes anomaly detection challenging. Moreover, there is objective mismatching that models are trained to minimize total reconstruction errors while expecting a small deviation on normal pixels and a large deviation on anomalous pixels. To tackle these two issues, we propose the iterative image inpainting method that reconstructs partial regions in an adaptive inpainting mask matrix. This method constructs inpainting masks from the anomaly score of structural similarity. Overlaying inpainting mask on images, each pixel is bypassed or reconstructed based on the anomaly score, enhancing reconstruction quality. The iterative update of inpainted images and masks by turns purifies the anomaly score directly and follows the expected objective at test time. We evaluated the proposed method using the MVTec Anomaly Detection dataset. 
Our method outperformed previous state-of-the-art in several categories and showed remarkable improvement in high-frequency textures.", + "title": "Iterative Image Inpainting with Structural Similarity Mask for Anomaly Detection", + "authors": [ + "Hitoshi Nakanishi", + "Masahiro Suzuki", + "Yutaka Matsuo" + ], + "emails": [ + "~Yutaka_Matsuo1", + "~Hitoshi_Nakanishi1", + "~Masahiro_Suzuki1" + ], + "rank": 2854 + }, + { + "url": "https://openreview.net/forum?id=3UTezOEABr", + "decision": "Reject", + "ratings": [ + 4, + 3, + 4 + ], + "rating": "3.62", + "confidences": [ + 4, + 5, + 4 + ], + "abstract": "Multivariate time series (MTS) data are becoming increasingly ubiquitous in diverse domains, e.g., IoT systems, health informatics, and 5G networks. To obtain an effective representation of MTS data, it is not only essential to consider unpredictable dynamics and highly variable lengths of these data but also important to address the irregularities in the sampling rates of MTS. Existing parametric approaches rely on manual hyperparameter tuning and may cost a huge amount of labor effort. Therefore, it is desirable to learn the representation automatically and efficiently. To this end, we propose an autonomous representation learning approach for multivariate time series (TimeAutoML) with irregular sampling rates and variable lengths. As opposed to previous works, we first present a representation learning pipeline in which the configuration and hyperparameter optimization\nare fully automatic and can be tailored for various tasks, e.g., anomaly detection, clustering, etc. Next, a negative sample generation approach and an auxiliary classification task are developed and integrated within TimeAutoML to enhance\nits representation capability. Extensive empirical studies on real-world datasets demonstrate that the proposed TimeAutoML outperforms competing approaches on various tasks by a large margin. 
In fact, it achieves the best anomaly detection\nperformance among all comparison algorithms on 78 out of all 85 UCR datasets, acquiring up to 20% performance improvement in terms of AUC score.", + "title": "TimeAutoML: Autonomous Representation Learning for Multivariate Irregularly Sampled Time Series", + "authors": [ + "Yang Jiao", + "Kai Yang", + "shaoyu dou", + "pan luo", + "Sijia Liu", + "Dongjin Song" + ], + "emails": [ + "~Sijia_Liu1", + "~Kai_Yang3", + "~Dongjin_Song2", + "tongji.edu.cn" + ], + "rank": 2855 + }, + { + "url": "https://openreview.net/forum?id=om1guSP_ray", + "decision": "Reject", + "ratings": [ + 3, + 3, + 5, + 4 + ], + "rating": "3.59", + "confidences": [ + 5, + 5, + 3, + 4 + ], + "abstract": "Graph neural networks (GNNs) are very efficient at solving several tasks in graphs such as node classification or graph classification. They come from an adaptation of convolutional neural networks on images to graph structured data. These models are very effective at finding patterns in images that can discriminate images from each others. Another aspect leading to their success is their ability to uncover hierarchical structures. This comes from the pooling operation that produces different versions of the input image at different scales. The same way, we want to identify patterns at different scales in graphs in order to improve the classification accuracy. Compared to the case of images, it is not trivial to develop a pooling layer on graphs. This is mainly due to the fact that in graphs nodes are not ordered and have irregular neighborhoods. To aleviate this issue, we propose a pooling layer based on edge cuts in graphs. This pooling layer works by computing edge scores that correspond to the importance of edges in the process of information propagation of the GNN. Moreover, we define a regularization function that aims at producing edge scores that minimize the minCUT problem. 
Finally, through extensive experiments we show that this architecture can compete with state-of- the-art methods.", + "title": "Graph Pooling by Edge Cut", + "authors": [ + "Alexis Galland", + "marc lelarge" + ], + "emails": [ + "~Alexis_Galland1", + "~marc_lelarge1" + ], + "rank": 2856 + }, + { + "url": "https://openreview.net/forum?id=jn1WDxmDe5P", + "decision": "Reject", + "ratings": [ + 4, + 4, + 3 + ], + "rating": "3.58", + "confidences": [ + 4, + 3, + 5 + ], + "abstract": "Data clustering is a well-known unsupervised learning approach. Despite the recent advances in clustering using deep neural networks, determining the number of clusters without any information about the given dataset remains an existing problem. There have been classical approaches based on data statistics that require the manual analysis of a data scientist to calculate the probable number of clusters in a dataset. In this work, we propose a new method for unsupervised prediction of the number of clusters in a dataset given only the data without any labels. We evaluate our method extensively on randomly generated datasets using the scikit-learn package and multiple computer vision datasets and show that our method is able to determine the number of classes in a dataset effectively without any supervision.", + "title": "Meta-k: Towards Unsupervised Prediction of Number of Clusters", + "authors": [ + "Azade Farshad", + "Samin Hamidi", + "Nassir Navab" + ], + "emails": [ + "studenti.uniroma1.it", + "~Nassir_Navab1", + "~Azade_Farshad1" + ], + "rank": 2857 + }, + { + "url": "https://openreview.net/forum?id=P3WG6p6Jnb", + "decision": "Reject", + "ratings": [ + 4, + 4, + 3 + ], + "rating": "3.58", + "confidences": [ + 3, + 4, + 5 + ], + "abstract": "Learning policies from fixed offline datasets is a key challenge to scale up reinforcement learning (RL) algorithms towards practical applications. 
This is often because off-policy RL algorithms suffer from distributional shift, due to mismatch between dataset and the target policy, leading to high variance and over-estimation of value functions. In this work, we propose variance regularization for offline RL algorithms, using stationary distribution corrections. We show that by using Fenchel duality, we can avoid double sampling issues for computing the gradient of the variance regularizer. The proposed algorithm for offline variance regularization can be used to augment any existing offline policy optimization algorithms. We show that the regularizer leads to a lower bound to the offline policy optimization objective, which can help avoid over-estimation errors, and explains the benefits of our approach across a range of continuous control domains when compared to existing algorithms. ", + "title": "Offline Policy Optimization with Variance Regularization", + "authors": [ + "Riashat Islam", + "Samarth Sinha", + "Homanga Bharadhwaj", + "Samin Yeasar Arnob", + "Zhuoran Yang", + "Zhaoran Wang", + "Animesh Garg", + "Lihong Li", + "Doina Precup" + ], + "emails": [ + "~Riashat_Islam1", + "~Zhuoran_Yang1", + "~Doina_Precup1", + "~Lihong_Li1", + "~Samin_Yeasar_Arnob1", + "~Animesh_Garg1", + "~Samarth_Sinha1", + "~Zhaoran_Wang1", + "~Homanga_Bharadhwaj1" + ], + "rank": 2858 + }, + { + "url": "https://openreview.net/forum?id=4jJI8Sqwz9V", + "ratings": [ + 3, + 4, + 3, + 4 + ], + "rating": "3.57", + "confidences": [ + 2, + 4, + 4, + 4 + ], + "abstract": "The instability when training generative adversarial networks (GANs) is a notoriously difficult issue, and the generalization of GANs remains open. In this paper, we will analyze various sources of instability which not only come from the discriminator but also the generator. We then point out that the requirement of Lipschitz continuity on both the discriminator and generator leads to generalization and stability for GANs. 
As a consequence, this work naturally provides a generalization bound for a large class of existing models and explains the success of recent large-scale generators. Finally, we show why data augmentation can ensure Lipschitz continuity on both the discriminator and generator. This work therefore provides a theoretical basis for a simple way to ensure generalization in GANs, explaining the highly successful use of data augmentation for GANs in practice.", + "title": "Generalization and Stability of GANs: A theory and promise from data augmentation", + "authors": [ + "Khoat Than", + "Nghia Vu" + ], + "emails": [ + "~Khoat_Than1", + "gmail.com" + ], + "rank": 2859 + }, + { + "url": "https://openreview.net/forum?id=NrN8XarA2Iz", + "decision": "Reject", + "ratings": [ + 4, + 4, + 2, + 5 + ], + "rating": "3.56", + "confidences": [ + 4, + 4, + 5, + 3 + ], + "abstract": "Reinforcement learning (RL) algorithms often have the limitation of sample complexity. Previous research has shown that the reliance on large amounts of experience can be mitigated through the presence of additional feedback. Automatic reward shaping is one approach to solving this problem, using automatic identification and modulation of shaping reward signals that are more informative about how agents should behave in any given scenario to learn and adapt faster. However, automatic reward shaping is still very challenging. To better study it, we break it down into two separate sub-problems: learning shaping reward signals in an application and learning how the signals can be adaptively used to provide a single reward feedback in the RL learning process. This paper focuses on the latter sub-problem. Unlike existing research, which tries to learn one shaping reward function from shaping signals, the proposed method learns to dynamically select the right reward signal to apply at each state, which is considerably more flexible. 
We further show that using an online strategy that seeks to match the learned shaping feedback with optimal value differences can lead to effective reward shaping and accelerated learning. The proposed ideas are verified through experiments in a variety of environments using different shaping reward paradigms.", + "title": "Learning to Dynamically Select Between Reward Shaping Signals", + "authors": [ + "Alexander Politowicz", + "Bing Liu" + ], + "emails": [ + "~Alexander_Politowicz1", + "~Bing_Liu1" + ], + "rank": 2860 + }, + { + "url": "https://openreview.net/forum?id=TRgh1LjcBvt", + "ratings": [ + 5, + 3, + 4, + 2 + ], + "rating": "3.56", + "confidences": [ + 4, + 5, + 4, + 3 + ], + "abstract": "Denoising is a fundamental challenge in scientific imaging. Deep convolutional neural networks (CNNs) provide the current state of the art in denoising natural images, where they produce impressive results. However, their potential has barely been explored in the context of scientific imaging. Denoising CNNs are typically trained on real natural images artificially corrupted with simulated noise. In contrast, in scientific applications, noiseless ground-truth images are usually not available. To address this issue, we propose a simulation-based denoising (SBD) framework, in which CNNs are trained on simulated images. We test the framework on data obtained from transmission electron microscopy (TEM), an imaging technique with widespread applications in material science, biology, and medicine. SBD outperforms existing techniques by a wide margin on a simulated benchmark dataset, as well as on real data. Apart from the denoised images, SBD generates likelihood maps to visualize the agreement between the structure of the denoised image and the observed data. Our results reveal shortcomings of state-of-the-art denoising architectures, such as their small field-of-view. 
Through a gradient-based analysis, we show that substantially increasing the field-of-view of the CNNs allows them to exploit non-local periodic patterns in the data, which is crucial at high noise levels. In addition, we perform a thorough analysis of the generalization capability of SBD, demonstrating that the trained networks are robust to variations of imaging parameters and of the underlying signal structure. Finally, we release the first publicly available benchmark dataset of TEM images, containing 18,000 examples.", + "title": "Deep Denoising for Scientific Discovery: A Case Study in Electron Microscopy", + "authors": [ + "Sreyas Mohan", + "Ram\u00f3n Manzorro", + "Joshua Vincent", + "Binh Tang", + "Dev Yashpal Sheth", + "Eero Peter Simoncelli", + "David S. Matteson", + "Peter Crozier", + "Carlos Fernandez-Granda" + ], + "emails": [ + "~Binh_Tang1", + "~Eero_Peter_Simoncelli1", + "~Carlos_Fernandez-Granda1", + "~David_S._Matteson1", + "~Peter_Crozier1", + "cse.iitm.ac.in", + "asu.edu", + "~Sreyas_Mohan1", + "uca.es" + ], + "rank": 2861 + }, + { + "url": "https://openreview.net/forum?id=JNtw9rUJnV", + "decision": "Reject", + "ratings": [ + 4, + 4, + 2, + 4, + 4 + ], + "rating": "3.55", + "confidences": [ + 4, + 4, + 5, + 4, + 5 + ], + "abstract": "We present a new zero-shot approach to automated machine learning (AutoML) that predicts a high-quality model for a supervised learning task and dataset in real-time without fitting a single model. In contrast, most AutoML systems require tens or hundreds of model evaluations. Hence our approach accelerates AutoML by orders of magnitude. Our method uses a transformer-based language embedding to represent datasets and algorithms using their free-text descriptions and a meta-feature extractor to represent the data. We train a graph neural network in which each node represents a dataset to predict the best machine learning pipeline for a new test dataset. 
The graph neural network generalizes to new datasets and new sets of datasets. Our approach leverages the progress of unsupervised representation learning in natural language processing to provide a significant boost to AutoML. Performance is competitive with state-of-the-art AutoML systems while reducing running time from minutes to seconds and prediction time from minutes to milliseconds, providing AutoML in real-time.", + "title": "Real-Time AutoML", + "authors": [ + "Iddo Drori", + "Brandon Kates", + "Anant Kharkar", + "Lu Liu", + "Qiang Ma", + "Jonah Deykin", + "Nihar Sidhu", + "Madeleine Udell" + ], + "emails": [ + "cornell.edu", + "~Iddo_Drori1", + "columbia.edu", + "~Qiang_Ma3", + "~Madeleine_Udell1" + ], + "rank": 2862 + }, + { + "url": "https://openreview.net/forum?id=EKw6nZ4QkJl", + "decision": "Reject", + "ratings": [ + 3, + 4, + 4, + 3 + ], + "rating": "3.53", + "confidences": [ + 4, + 4, + 4, + 3 + ], + "abstract": "Knowledge graph completion aims to predict the new links in given entities among the knowledge graph (KG). Most mainstream embedding methods focus on fact triplets contained in the given KG, however, ignoring the rich background information provided by logic rules driven from knowledge base implicitly. To solve this problem, in this paper, we propose a general framework, named EM-RBR(embedding and rule-based reasoning), capable of combining the advantages of reasoning based on rules and the state-of-the-art models of embedding. EM-RBR aims to utilize relational background knowledge contained in rules to conduct multi-relation reasoning link prediction rather than superficial vector triangle linkage in embedding models. By this way, we can explore relation between two entities in deeper context to achieve higher accuracy. 
In experiments, we demonstrate that EM-RBR achieves better performance compared with previous models on FB15k, WN18 and our new dataset FB15k-R, especially the new dataset where our model perform futher better than those state-of-the-arts. We make the implementation of EM-RBR available at https://github.com/1173710224/link-prediction-with-rule-based-reasoning.", + "title": "EM-RBR: a reinforced framework for knowledge graph completion from reasoning perspective", + "authors": [ + "Bozhou Chen", + "Zhaochong An", + "Houde Quan", + "Qihui Lin", + "Hongzhi Wang" + ], + "emails": [ + "~Houde_Quan1", + "~Zhaochong_An1", + "~Qihui_Lin1", + "~Bozhou_Chen1", + "~Hongzhi_Wang2" + ], + "rank": 2863 + }, + { + "url": "https://openreview.net/forum?id=EQtwFlmq7mx", + "decision": "Reject", + "ratings": [ + 4, + 3, + 3, + 4 + ], + "rating": "3.53", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "We revisit the stochastic proximal point algorithm (SPPA) for large-scale nonconvex optimization problems. SPPA has been shown to converge faster and more stable than the celebrated stochastic gradient descent (SGD) algorithm, and its many variations, for convex problems. However, the per-iteration update of SPPA is defined abstractly and has long been considered expensive. In this paper, we show that efficient implementation of SPPA can be achieved. If the problem is a nonlinear least squares, each iteration of SPPA can be efficiently implemented by Gauss-Newton; with some linear algebra trick the resulting complexity is in the same order of SGD. For more generic problems, SPPA can still be implemented with L-BFGS or accelerated gradient with high efficiency. Another contribution of this work is the convergence of SPPA to a stationary point in expectation for nonconvex problems. The result is encouraging that it admits more flexible choices of the step sizes under similar assumptions. 
The proposed algorithm is elaborated for both regression and classification problems using different neural network structures. Real data experiments showcase its effectiveness in terms of convergence and accuracy compared to SGD and its variants.", + "title": "Stochastic Proximal Point Algorithm for Large-scale Nonconvex Optimization: Convergence, Implementation, and Application to Neural Networks", + "authors": [ + "Aysegul Bumin", + "Kejun Huang" + ], + "emails": [ + "~Kejun_Huang1", + "ufl.edu" + ], + "rank": 2864 + }, + { + "url": "https://openreview.net/forum?id=GzMUD_GGvJN", + "decision": "Reject", + "ratings": [ + 3, + 3, + 4, + 4 + ], + "rating": "3.53", + "confidences": [ + 4, + 3, + 4, + 4 + ], + "abstract": "Representation Learning methods can allow the application of Reinforcement Learning algorithms when a high dimensionality in a robot's perceptions would otherwise prove prohibitive. Consequently, unsupervised Representation Learning components often feature in robot control algorithms that assume high-dimensional camera images as the principal source of information.\nIn their design and performance, these algorithms often benefit from the controlled nature of the simulation or laboratory conditions they are evaluated in. However, these settings fail to acknowledge the stochasticity of most real-world environments.\nIn this work, we introduce the concept of Distraction-Robust Representation Learning. We argue that environment noise and other distractions require learned representations to encode the robot's expected perceptions rather than the observed ones. 
Our experimental evaluations demonstrate that representations learned with a traditional dimensionality reduction algorithm are strongly susceptible to distractions in a robot's environment.\nWe propose an Encoder-Decoder architecture that produces representations that allow the learning outcomes of robot control tasks to remain unaffected by these distractions.", + "title": "On the Importance of Distraction-Robust Representations for Robot Learning", + "authors": [ + "Andy Wang", + "Antoine Cully" + ], + "emails": [ + "~Antoine_Cully1", + "~Andy_Wang1" + ], + "rank": 2865 + }, + { + "url": "https://openreview.net/forum?id=mzfqkPOhVI4", + "decision": "Reject", + "ratings": [ + 5, + 3, + 3, + 3 + ], + "rating": "3.53", + "confidences": [ + 5, + 5, + 4, + 5 + ], + "abstract": "Spatial-temporal data forecasting is of great importance for industries such as telecom network operation and transportation management. However, spatial-temporal data is inherent with complex spatial-temporal correlations and behaves heterogeneities among the spatial and temporal aspects, which makes the forecasting remain as a very challenging task though recently great work has been done. In this paper, we propose a novel model, Adaptive Spatial-Temporal Inception Graph Convolution Networks (ASTI-GCN), to solve the multi-step spatial-temporal data forecasting problem. The model proposes multi-scale spatial-temporal joint graph convolution block to directly model the spatial-temporal joint correlations without introducing elaborately constructed mechanisms. Moreover inception mechanism combined with the graph node-level attention is introduced to make the model capture the heterogeneous nature of the graph adaptively. Our experiments on three real-world datasets from two different fields consistently show ASTI-GCN outperforms the state-of-the-art performance. In addition, ASTI-GCN is proved to generalize well. 
", + "title": "Adaptive Spatial-Temporal Inception Graph Convolutional Networks for Multi-step Spatial-Temporal Network Data Forecasting", + "authors": [ + "Xing Wang", + "Lin Zhu", + "Juan Zhao", + "Zhou Xu", + "Zhao Li", + "Junlan Feng", + "Chao Deng" + ], + "emails": [ + "~Juan_Zhao2", + "~Zhou_Xu1", + "~Chao_Deng3", + "~Zhao_Li4", + "~Lin_Zhu5", + "~Junlan_Feng2", + "~Xing_Wang4" + ], + "rank": 2866 + }, + { + "url": "https://openreview.net/forum?id=3b76QBOlYW", + "ratings": [ + 3, + 4, + 5, + 3 + ], + "rating": "3.53", + "confidences": [ + 4, + 4, + 2, + 5 + ], + "abstract": "Computer generated holography (CGH) aims to generate phase plates that create an intensity pattern at a certain distance behind the holography plate when illuminated. Since only the intensity and not the phase of the wave is of interest, this is an ill-defined inverse problem. Usually these problems are tackled by iterative optimization algorithms which are part of the convex optimization framework. These algorithms essentially minimize a loss using a forward model. Even though many of the tackled inverse problems are non-convex, these algorithms reach acceptable solutions by finding a local minimum. The ability of Deep Neural Networks to estimate a large range of functions has made a different approach to these problems possible. Instead of an iterative optimization algorithm that converges to a (sub-)optimal solution, the inverse problem can be solved by training a neural network to directly estimate the inverse operator. However simple convolutional neural networks tend to overfit when learning the inverse operator and do not generalize well outside the training distribution. Therefore this paper introduces a hybrid approach that can be interpreted as an unrolled Gerchberg-Saxton algorithm, which we term Learned Residual Gerchberg-Saxton (LRGS) network. 
We train this network for the generation of multi-focus computer generated holograms, and beat state-of-the-art existing methods.", + "title": "Learned residual Gerchberg-Saxton network for computer generated holography", + "authors": [ + "Lennart Schlieder", + "Heiner Kremer", + "Valentin Volchkov", + "Kai Melde", + "Peer Fischer", + "Bernhard Sch\u00f6lkopf" + ], + "emails": [ + "~Bernhard_Sch\u00f6lkopf1", + "~Lennart_Schlieder1", + "tuebinge.mpg.de", + "is.mpg.de", + "tuebingen.mpg.de" + ], + "rank": 2867 + }, + { + "url": "https://openreview.net/forum?id=4hA23Eld-HU", + "ratings": [ + 3, + 4, + 4, + 3 + ], + "rating": "3.53", + "confidences": [ + 5, + 4, + 5, + 3 + ], + "abstract": "This paper proposes an algorithm which learns to control on the fly. The proposed algorithm has no access to the transition law of the environment, which is actually linear with bounded random noise, and learns to make decisions directly online without training phases or sub-optimal policies as the initial input. Neither estimating the system parameters nor the value functions online, the proposed algorithm adapts the ellipsoid method into the online decision making setting. By adding linear constraints when the feasibility of the decision variable is violated, the volume of the decision variable domain can be collapsed and we upper bound the number of online linear constraints needed for the convergence of the state to be around the desired state under the bounded random state noise. 
The algorithm is also proved to be of constant bounded online regret given certain range of the bound of the random noise.", + "title": "Learning to Control on the Fly", + "authors": [ + "Zhanzhan Zhao" + ], + "emails": [ + "~Zhanzhan_Zhao1" + ], + "rank": 2868 + }, + { + "url": "https://openreview.net/forum?id=3NG1WgOn0y2", + "ratings": [ + 5, + 5, + 2, + 3 + ], + "rating": "3.53", + "confidences": [ + 3, + 4, + 5, + 5 + ], + "abstract": "Areal spatial data represent not only geographical locations but also sizes and shapes of physical objects such as buildings in a city. Data-driven generation of such vector-format data requires an effective representation. Inspired by the hierarchical nature of such spatial data, we propose AETree, a tree-based deep auto-encoder network. Unlike common strategies that either treat the data as an unordered set or sort them into a sequence, we preprocess the data into a binary tree via hierarchical clustering. Then a tree encoder learns to extract and merge spatial information from bottom-up iteratively. The resulting global representation is reversely decoded for reconstruction or generation. Experiments on large scale 2D/3D building datasets of both New York and Zurich showed superior performance of AETree than either set-based or sequential auto-regressive deep models.", + "title": "AETree: Areal Spatial Data Generation", + "authors": [ + "Congcong Wen", + "Wenyu Han", + "Hang Zhao", + "Chen Feng" + ], + "emails": [ + "~Hang_Zhao1", + "nyu.edu", + "~Chen_Feng2" + ], + "rank": 2869 + }, + { + "url": "https://openreview.net/forum?id=J4XaMT9OcZ", + "decision": "Reject", + "ratings": [ + 5, + 3, + 2, + 4 + ], + "rating": "3.50", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "The double descent curve is one of the most intriguing properties of deep neural networks. 
It contrasts the classical bias-variance curve with the behavior of modern neural networks, occurring where the number of samples nears the number of parameters. In this work, we explore the connection between the double descent phenomena and the number of samples in the deep neural network setting. In particular, we propose a construction which augments the existing dataset by artificially increasing the number of samples. This construction empirically mitigates the double descent curve in this setting. We reproduce existing work on deep double descent, and observe a smooth descent into the overparameterized region for our construction. This occurs both with respect to the model size, and with respect to the number epochs.", + "title": "Mitigating Deep Double Descent by Concatenating Inputs", + "authors": [ + "John Chen", + "Qihan Wang", + "Anastasios Kyrillidis" + ], + "emails": [ + "~John_Chen3", + "~Qihan_Wang1", + "~Anastasios_Kyrillidis2" + ], + "rank": 2870 + }, + { + "url": "https://openreview.net/forum?id=jpDaS6jQvcr", + "decision": "Reject", + "ratings": [ + 4, + 4, + 3, + 3 + ], + "rating": "3.50", + "confidences": [ + 4, + 3, + 4, + 3 + ], + "abstract": "Unsupervised anomaly detection plays a crucial role in many critical applications. Driven by the success of deep learning, recent years have witnessed growing interests in applying deep neural networks (DNNs) to anomaly detection problems. A common approach is to use autoencoders to learn a feature representation for the normal (non-anomalous) observations in the data. The reconstruction error of the autoencoder is then used as outlier scores to detect anomalies. However, due to the high complexity brought upon by over-parameterization of DNNs, the reconstruction error of the anomalies could also be small, which hampers the effectiveness of these methods. 
To alleviate this problem, we propose a robust framework using collaborative autoencoders to jointly identify normal observations from the data while learning its feature representation. We investigate the theoretical properties of the framework and empirically show its outstanding performance as compared to other DNN-based methods. Our experimental results also show the resiliency of the framework to missing values compared to other baseline methods.", + "title": "Unsupervised Anomaly Detection by Robust Collaborative Autoencoders", + "authors": [ + "Boyang Liu", + "Ding Wang", + "Kaixiang Lin", + "Pang-Ning Tan", + "Jiayu Zhou" + ], + "emails": [ + "msu.edu", + "~Boyang_Liu1", + "~Pang-Ning_Tan1", + "~Jiayu_Zhou1", + "~Kaixiang_Lin1" + ], + "rank": 2871 + }, + { + "url": "https://openreview.net/forum?id=SVP44gujOBL", + "decision": "Reject", + "ratings": [ + 3, + 4, + 3, + 4 + ], + "rating": "3.50", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "In practice, sequence of mini-batches generated by uniform sampling of examples from the entire data is used for training neural networks. Curriculum learning is a training strategy that sorts the training examples by their difficulty and gradually exposes them to the learner. In this work, we propose two novel curriculum learning algorithms and empirically show their improvements in performance with convolutional and fully-connected neural networks on multiple real image datasets. Our dynamic curriculum learning algorithm tries to reduce the distance between the network weight and an optimal weight at any training step by greedily sampling examples with gradients that are directed towards the optimal weight. The curriculum ordering determined by our dynamic algorithm achieves a training speedup of \u223c45% in our experiments. 
We also introduce a new task-specific curriculum learning strategy that uses statistical measures such as standard deviation and entropy values to score the difficulty of data points in natural image datasets. We show that this new approach yields a mean training speedup of \u223c43% in the experiments we perform. Further, we also use our algorithms to learn why curriculum learning works. Based on our study, we argue that curriculum learning removes noisy examples from the initial phases of training, and gradually exposes them to the learner acting like a regularizer that helps in improving the generalization ability of the learner.", + "title": "A Simple Approach To Define Curricula For Training Neural Networks", + "authors": [ + "Vinu Sankar Sadasivan", + "Anirban Dasgupta" + ], + "emails": [ + "~Vinu_Sankar_Sadasivan1", + "~Anirban_Dasgupta1" + ], + "rank": 2872 + }, + { + "url": "https://openreview.net/forum?id=Lq1srMWfUAi", + "ratings": [ + 3, + 2, + 6 + ], + "rating": "3.50", + "confidences": [ + 5, + 5, + 4 + ], + "abstract": "We present a novel algorithm -- convex natural evolutionary strategies (CoNES) -- for optimizing high-dimensional blackbox functions by leveraging tools from convex optimization and information geometry. CoNES is formulated as an efficiently-solvable convex program that adapts the evolutionary strategies (ES) gradient estimate to promote rapid convergence. The resulting algorithm is invariant to the parameterization of the belief distribution. Our numerical results demonstrate that CoNES vastly outperforms conventional blackbox optimization methods on a suite of functions used for benchmarking blackbox optimizers. 
Furthermore, CoNES demonstrates the ability to converge faster than conventional blackbox methods on a selection of OpenAI's MuJoCo reinforcement learning tasks for locomotion.", + "title": "CoNES: Convex Natural Evolutionary Strategies", + "authors": [ + "Sushant Veer", + "Anirudha Majumdar" + ], + "emails": [ + "~Anirudha_Majumdar1", + "~Sushant_Veer1" + ], + "rank": 2873 + }, + { + "url": "https://openreview.net/forum?id=wKfXaxPist", + "ratings": [ + 3, + 5, + 3 + ], + "rating": "3.50", + "confidences": [ + 4, + 3, + 5 + ], + "abstract": "Large-scale pre-trained language models have shown remarkable results in diverse NLP applications. Unfortunately, these performance gains have been accompanied by a significant increase in computation time and model size, stressing the need to develop new or complementary strategies to increase the efficiency and interpretability of current large language models, such as BERT. In this paper we propose DACT-BERT, a differentiable adaptive computation time strategy for BERT language model. DACT-BERT adds an adaptive computation mechanism to the regular processing pipeline of BERT. This mechanism controls the number of transformer blocks that BERT needs to execute at inference time. By doing this, the model makes predictions based on the most appropriate intermediate representations for the task encoded by the pre-trained weights. With respect to previous works, DACT-BERT has the advantage of being fully differentiable and directly integrated to BERT's main processing pipeline. This enables the incorporation of gradient-based transparency mechanisms to improve interpretability. Furthermore, by discarding useless steps, DACT-BERT facilitates the understanding of the underlying process used by BERT to reach an inference. Our experiments demonstrate that our approach is effective in significantly reducing computational complexity without affecting model accuracy. 
Additionally, they also demonstrate that DACT-BERT helps to improve model interpretability. ", + "title": "DACT-BERT: Increasing the efficiency and interpretability of BERT by using adaptive computation time.", + "authors": [ + "Cristobal Eyzaguirre", + "Felipe del Rio", + "Vladimir Araujo", + "Alvaro Soto" + ], + "emails": [ + "uc.cl", + "~Alvaro_Soto1", + "~Felipe_del_Rio1", + "~Cristobal_Eyzaguirre1" + ], + "rank": 2874 + }, + { + "url": "https://openreview.net/forum?id=W75l6XMzLq", + "decision": "Reject", + "ratings": [ + 3, + 4, + 4, + 3 + ], + "rating": "3.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "In multi-goal tasks with sparse rewards, it is challenging to learn from tons of experiences with zero rewards. Hindsight experience replay (HER), which replays past experiences with additional heuristic goals, has shown it possible for off-policy reinforcement learning (RL) to make use of failed experiences. However, the replayed experiences may not lead to well-explored state-action pairs, especially for a pseudo goal, which instead results in a poor estimate of the value function. To tackle the problem, we propose to resample hindsight experiences based on their likelihood under the current policy and the overall distribution. Based on the hindsight strategy, we introduce a novel multi-goal experience replay method that automatically generates a training curriculum, namely Hindsight Curriculum Generation (HCG). As the range of experiences expands, the generated curriculum strikes a dynamic balance between exploiting and exploring. 
We implement HCG with the vanilla Deep Deterministic Policy Gradient(DDPG), and experiments on several tasks with sparse binary rewards demonstrate that HCG improves sample efficiency of the state of the art.", + "title": "Hindsight Curriculum Generation Based Multi-Goal Experience Replay", + "authors": [ + "Xiaoyun Feng" + ], + "emails": [ + "~Xiaoyun_Feng1" + ], + "rank": 2875 + }, + { + "url": "https://openreview.net/forum?id=rvosiWfMoMR", + "decision": "Reject", + "ratings": [ + 2, + 4, + 5 + ], + "rating": "3.50", + "confidences": [ + 5, + 3, + 4 + ], + "abstract": "When talking about computer-based music generation, two are the main threads of research: the construction of autonomous music-making systems, and the design of computer-based environments to assist musicians. However, even though creating accompaniments for melodies is an essential part of every producer's and songwriter's work, little effort has been done in the field of automatic music arrangement in the audio domain. In this contribution, we propose a novel framework for automatic music accompaniment in the Mel-frequency domain. Using several songs converted into Mel-spectrograms, a two-dimensional time-frequency representation of audio signals, we were able to automatically generate original arrangements for both bass and voice lines. Treating music pieces as images (Mel-spectrograms) allowed us to reformulate our problem as an unpaired image-to-image translation problem, and to tackle it with CycleGAN, a well-established framework. Moreover, the choice to deploy raw audio and Mel-spectrograms enabled us to more effectively model long-range dependencies, to better represent how humans perceive music, and to potentially draw sounds for new arrangements from the vast collection of music recordings accumulated in the last century. 
Our approach was tested on two different downstream tasks: given a bass line creating credible and on-time drums, and given an acapella song arranging it to a full song. In absence of an objective way of evaluating the output of music generative systems, we also defined a possible metric for the proposed task, partially based on human (and expert) judgment.", + "title": "Automatic Music Production Using Generative Adversarial Networks", + "authors": [ + "Giorgio Barnab\u00f2", + "Giovanni Trappolini", + "Lorenzo Lastilla", + "Cesare Campagnano", + "Angela Fan", + "Fabio Petroni", + "Fabrizio Silvestri" + ], + "emails": [ + "~Giovanni_Trappolini1", + "~Angela_Fan2", + "~Giorgio_Barnab\u00f21", + "~Fabio_Petroni1", + "~Lorenzo_Lastilla1", + "~Fabrizio_Silvestri2", + "studenti.uniroma1.it" + ], + "rank": 2876 + }, + { + "url": "https://openreview.net/forum?id=boZj4g3Jocj", + "decision": "Reject", + "ratings": [ + 3, + 4, + 4, + 3 + ], + "rating": "3.50", + "confidences": [ + 5, + 3, + 5, + 3 + ], + "abstract": "The human imagination is an integral component of our intelligence. Furthermore, the core utility of our imagination is deeply coupled with communication. Language, argued to have been developed through complex interaction within growing collective societies serves as an instruction to the imagination, giving us the ability to share abstract mental representations and perform joint spatiotemporal planning. In this paper, we explore communication through imagination with multi-agent reinforcement learning. Specifically, we develop a model-based approach where agents jointly plan through recurrent communication of their respective predictions of the future. Each agent has access to a learned world model capable of producing model rollouts of future states and predicted rewards, conditioned on the actions sampled from the agent's policy. 
These rollouts are then encoded into messages and used to learn a communication protocol during training via differentiable message passing. We highlight the benefits of our model-based approach, compared to a set of strong baselines, by developing a set of specialised experiments using novel as well as well-known multi-agent environments.", + "title": "Learning to communicate through imagination with model-based deep multi-agent reinforcement learning", + "authors": [ + "Arnu Pretorius", + "Scott Cameron", + "Andries Petrus Smit", + "Elan van Biljon", + "Lawrence Francis", + "Femi Azeez", + "Alexandre Laterre", + "Karim Beguir" + ], + "emails": [ + "~Alexandre_Laterre1", + "~Arnu_Pretorius1", + "instadeep.com", + "gmail.com", + "live.co.uk" + ], + "rank": 2877 + }, + { + "url": "https://openreview.net/forum?id=u15gHPQViL", + "decision": "Reject", + "ratings": [ + 3, + 4, + 3, + 4 + ], + "rating": "3.50", + "confidences": [ + 5, + 4, + 4, + 5 + ], + "abstract": "We present a new visual-semantic embedding method for generalized zero-shot learning. Existing embedding-based methods aim to learn the correspondence between an image classifier (visual representation) and its class prototype (semantic representation) for each class. Inspired by the binary relevance method for multi-label classification, we learn the mapping between an image and its semantic classifier. Given an input image, the proposed Image-Guided Semantic Classification (IGSC) method creates a label classifier, being applied to all label embeddings to determine whether a label belongs to the input image. Therefore, a semantic classifier is image conditioned and is generated during inference. We also show that IGSC is a unifying framework for two state-of-the-art deep-embedding methods. 
We validate our approach with four standard benchmark datasets.\n\n", + "title": "Zero-Shot Recognition through Image-Guided Semantic Classification", + "authors": [ + "Mei-Chen Yeh", + "Fang Li", + "Bo-Heng Li" + ], + "emails": [ + "~Mei-Chen_Yeh1", + "~Fang_Li5", + "gmail.com" + ], + "rank": 2878 + }, + { + "url": "https://openreview.net/forum?id=6fb4mex_pUT", + "decision": "Reject", + "ratings": [ + 4, + 3, + 4, + 3 + ], + "rating": "3.47", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "Deep neural networks (DNNs), especially convolutional neural networks, have achieved superior performance on image classification tasks. However, such performance is only guaranteed if the input to a trained model is similar to the training samples, i.e., the input follows the probability distribution of the training set. Out-Of-Distribution (OOD) samples do not follow the distribution of training set, and therefore the predicted class labels on OOD samples become meaningless. Classi\ufb01cation-based methods have been proposed for OOD detection; however, in this study we show that this type of method has no theoretical guarantee and is practically breakable by our OOD Attack algorithm because of dimensionality reduction in the DNN models. We also show that Glow likelihood-based OOD detection is breakable as well. ", + "title": "An Algorithm for Out-Of-Distribution Attack to Neural Network Encoder ", + "authors": [ + "Liang Liang", + "Linhai Ma", + "Linchen Qian", + "Jiasong Chen" + ], + "emails": [ + "~Liang_Liang2", + "miami.edu", + "~Linhai_Ma1" + ], + "rank": 2879 + }, + { + "url": "https://openreview.net/forum?id=GbCkSfstOIA", + "decision": "Reject", + "ratings": [ + 4, + 4, + 2, + 4 + ], + "rating": "3.47", + "confidences": [ + 5, + 4, + 5, + 5 + ], + "abstract": "We proposed a novel loss function that combines supervised learning with clustering in deep neural networks. 
Taking advantage of the data distribution and the existence of some labeled data, we construct a meaningful latent space. Our loss function consists of three parts, the quality of the clustering result, the margin between clusters, and the classification error of labeled instances. Our proposed model is trained to minimize our loss function by backpropagation, avoiding the need for pre-training or additional networks. This guides our network to classify labeled samples correctly while able to find good clusters simultaneously. We applied our proposed method on MNIST, USPS, ETH-80, and COIL-100; the comparison results confirm our model's outstanding performance over semi-supervised learning.", + "title": "Semi-Supervised Learning via Clustering Representation Space", + "authors": [ + "Yen-Chieh Huang", + "Yuh-Jye Lee", + "Chih-Chi Wu", + "Yi-Wei Chiu", + "Yong-Xiang Lin", + "\bCHENG-YING LI", + "Po-Hung Ko" + ], + "emails": [ + "nctu.edu.tw", + "~Yuh-Jye_Lee1", + "gmail.com", + "~Chih-Chi_Wu1", + "~Yi-Wei_Chiu1" + ], + "rank": 2880 + }, + { + "url": "https://openreview.net/forum?id=gtwVBChN8td", + "decision": "Reject", + "ratings": [ + 3, + 5, + 3, + 3 + ], + "rating": "3.47", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "The overestimation problem has long been popular in deep value learning, because function approximation errors may lead to amplified value estimates and suboptimal policies. There have been several methods to deal with the overestimation problem, however, further problems may be induced, for example, the underestimation bias and instability. In this paper, we focus on the overestimation issues on continuous control through deep reinforcement learning, and propose a novel algorithm that can minimize the overestimation, avoid the underestimation bias and retain the policy improvement during the whole training process. 
Specifically, we add a weight factor to adjust the influence of two independent critics, and use the combined value of weighted critics to update the policy. Then the updated policy is involved in the update of the weight factor, in which we propose a novel method to provide theoretical and experimental guarantee for future policy improvement. We evaluate our method on a set of classical control tasks, and the results show that the proposed algorithms are more computationally efficient and stable than several existing algorithms for continuous control.", + "title": "Deep Reinforcement Learning With Adaptive Combined Critics", + "authors": [ + "Huihui Zhang", + "Wu Huang" + ], + "emails": [ + "~Wu_Huang1", + "~Huihui_Zhang1" + ], + "rank": 2881 + }, + { + "url": "https://openreview.net/forum?id=3FkrodAXdk", + "decision": "Reject", + "ratings": [ + 3, + 3, + 4, + 4 + ], + "rating": "3.47", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "Diverse deep ensembles hold the potential for improving accuracy and robustness of deep learning models. Both pairwise and non-pairwise ensemble diversity metrics have been proposed over the past two decades. However, it is also challenging to find the right metrics that can effectively prune those deep ensembles with insufficient ensemble diversity, thus failing to deliver effective ensemble accuracy. In this paper, we first compare six popular diversity metrics in the literature, coined as Q metrics, including both pairwise and non-pairwise representatives. We analyze their inherent limitations in capturing the negative correlation of ensemble member models, and thus inefficient in identifying and pruning low quality ensembles. 
We next present six HQ ensemble diversity metrics by extending the existing Q-metrics with three novel optimizations: (1) We introduce the concept of focal model and separately measure the ensemble diversity among the deep ensembles of the same team size with the concept of focal model, aiming to better capture the negative correlations of member models of an ensemble. (2) We introduce six HQ-diversity metrics to optimize the corresponding Q-metrics respectively in terms of measuring negative correlation among member models of an ensemble using its ensemble diversity score. (3) We introduce a two phase hierarchical pruning method to effectively identify and prune those deep ensembles with high HQ diversity scores, aiming to increase the lower and upper bounds on ensemble accuracy for the selected ensembles. By combining these three optimizations, deep ensembles selected based on our hierarchical diversity pruning approach significantly outperforms those selected by the corresponding Q-metrics. Comprehensive experimental evaluation over several benchmark datasets shows that our HQ-metrics can effectively select high diversity deep ensembles by pruning out those ensembles with insufficient diversity, and successfully increase the lower bound (worst case) accuracy of the selected deep ensembles, compared to those selected using the state-of-the-art Q-metrics.", + "title": "Deep Ensembles with Hierarchical Diversity Pruning", + "authors": [ + "Yanzhao Wu", + "Ling Liu" + ], + "emails": [ + "~Yanzhao_Wu1", + "~Ling_Liu3" + ], + "rank": 2882 + }, + { + "url": "https://openreview.net/forum?id=389rLpWoOlG", + "decision": "Reject", + "ratings": [ + 3, + 4, + 4, + 3 + ], + "rating": "3.47", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "The lack of labeled data is a major problem in both research and industrial settings since obtaining labels is often an expensive and time-consuming activity. 
In the past years, several machine learning algorithms were developed to assist and perform automated labeling in partially labeled datasets. While many of these algorithms are available in open-source packages, there is no research that investigates how these algorithms compare to each other in different types of datasets and with different percentages of available labels. To address this problem, this paper empirically evaluates and compares seven algorithms for automated labeling in terms of accuracy. We investigate how these algorithms perform in six different and well-known datasets with three different types of data, images, texts, and numerical values. We evaluate these algorithms under two different experimental conditions, with 10\\% and 50\\% labels of available labels in the dataset. Each algorithm, in each dataset for each experimental condition, is evaluated independently ten times with different random seeds. The results are analyzed and the algorithms are compared utilizing a Bayesian Bradley-Terry model. The results indicate that while the algorithms label spreading with K-nearest neighbors perform better in the aggregated results, the active learning algorithms query by instance QBC and query instance uncertainty sample perform better when there is only 10\\% of labels available. 
These results can help machine learning practitioners in choosing optimal machine learning algorithms to label their data.", + "title": "Machine Learning Algorithms for Data Labeling: An Empirical Evaluation", + "authors": [ + "Teodor Anders Fredriksson", + "David Issa Mattos", + "Jan Bosch", + "Helena Holmstr\u00f6m Olsson" + ], + "emails": [ + "mau.se", + "chalmers.se", + "~Teodor_Anders_Fredriksson1" + ], + "rank": 2883 + }, + { + "url": "https://openreview.net/forum?id=t3S-7WMOXA-", + "ratings": [ + 4, + 4, + 3, + 3 + ], + "rating": "3.47", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Learning multimodal representations is a requirement for many tasks such as image--caption retrieval. Previous work on this problem has only focused on finding good vector representations without any explicit measure of uncertainty. In this work, we argue and demonstrate that learning multimodal representations as probability distributions can lead to better representations, as well as providing other benefits such as adding a measure of uncertainty to the learned representations. We show that this measure of uncertainty can capture how confident our model is about the representations in the multimodal domain, i.e, how clear it is for the model to retrieve/predict the matching pair. 
We experiment with similarity metrics that have not been traditionally used for the multimodal retrieval task, and show that the choice of the similarity metric affects the quality of the learned representations.", + "title": "Probabilistic Multimodal Representation Learning", + "authors": [ + "Leila Pishdad", + "Ran Zhang", + "Afsaneh Fazly", + "Allan Jepson" + ], + "emails": [ + "~Leila_Pishdad1", + "samsung.com" + ], + "rank": 2884 + }, + { + "url": "https://openreview.net/forum?id=otuxSY_QDZ9", + "decision": "Reject", + "ratings": [ + 2, + 5, + 5, + 3 + ], + "rating": "3.47", + "confidences": [ + 5, + 3, + 3, + 4 + ], + "abstract": "Classification is a pivotal function for many computer vision tasks such as image recognition, object detection, scene segmentation. Multinomial logistic regression with a single final layer of dense connections has become the ubiquitous technique for CNN-based classification. While these classifiers project a mapping between the input and a set of output category classes, they do not typically yield a comprehensive description of the category. In particular, when a CNN based image classifier correctly identifies the image of a Chimpanzee, its output does not clarify that Chimpanzee is a member of Primate, Mammal, Chordate families and a living thing. We propose a multilayer dense connectivity for a CNN to simultaneously predict the category \\emph{and} its conceptual superclasses in hierarchical order. 
We experimentally demonstrate that our proposed dense connections, in conjunction with popular convolutional feature layers, can learn to predict the conceptual classes with minimal increase in network size while maintaining the categorical classification accuracy.", + "title": "Multilayer Dense Connections for Hierarchical Concept Classification", + "authors": [ + "Toufiq Parag", + "Hongcheng Wang" + ], + "emails": [ + "~Hongcheng_Wang4", + "~Toufiq_Parag3" + ], + "rank": 2885 + }, + { + "url": "https://openreview.net/forum?id=u78MCr9_7Z9", + "ratings": [ + 4, + 6, + 1 + ], + "rating": "3.46", + "confidences": [ + 4, + 4, + 5 + ], + "abstract": "Learning object-centric scene representations is crucial for scene structural understanding. However, current unsupervised scene factorization and representation learning models do not reason about scene objects' relations while making an inference. In this paper, we address the issue by introducing a differentiable correlation prior that forces the inference models to suppress duplicate object representations. The extension is evaluated by adding it to three different scene understanding approaches. The results show that the models trained with the proposed method not only outperform the original models in scene factorization and have fewer duplicate representations, but also close the approximation gap between the data evidence and the evidence lower bound.", + "title": "Non-maximum Suppression Also Closes the Variational Approximation Gap of Multi-object Variational Autoencoders", + "authors": [ + "Li Nanbo", + "Robert Burns Fisher" + ], + "emails": [ + "~Robert_Burns_Fisher1", + "ed.ac.uk" + ], + "rank": 2886 + }, + { + "url": "https://openreview.net/forum?id=Nq5zyAUD65", + "decision": "Reject", + "ratings": [ + 2, + 4, + 5, + 4 + ], + "rating": "3.46", + "confidences": [ + 5, + 2, + 3, + 3 + ], + "abstract": "Deep networks are gradually penetrating almost every domain in our lives due to their amazing success. 
However, with substantive performance accuracy improvements comes the price of irreproducibility. Two identical models, trained on the exact same training dataset may exhibit large differences in predictions on individual examples even when average accuracy is similar, especially when trained on highly distributed parallel systems. The popular Rectified Linear Unit (ReLU) activation has been key to recent success of deep networks. We demonstrate, however, that ReLU is also a catalyzer to irreproducibility in deep networks. We show that not only can activations smoother than ReLU provide better accuracy, but they can also provide better accuracy-reproducibility tradeoffs. We propose a new family of activations; Smooth ReLU (SmeLU), designed to give such better tradeoffs, while also keeping the mathematical expression simple, and thus implementation cheap. SmeLU is monotonic, mimics ReLU, while providing continuous gradients, yielding better reproducibility. We generalize SmeLU to give even more flexibility and then demonstrate that SmeLU and its generalized form are special cases of a more general methodology of REctified Smooth Continuous Unit (RESCU) activations. \nEmpirical results demonstrate the superior accuracy-reproducibility tradeoffs with smooth activations, SmeLU in particular.", + "title": "Smooth Activations and Reproducibility in Deep Networks", + "authors": [ + "Gil I Shamir", + "Dong Lin", + "Lorenzo Coviello" + ], + "emails": [ + "google.com", + "~Lorenzo_Coviello1" + ], + "rank": 2887 + }, + { + "url": "https://openreview.net/forum?id=aJLjjpi0Vty", + "decision": "Reject", + "ratings": [ + 4, + 3, + 4, + 3 + ], + "rating": "3.44", + "confidences": [ + 4, + 4, + 3, + 5 + ], + "abstract": "The problem of predicting the rating of a set of users to a set of items in a recommender system based on partial knowledge of the ratings is widely known as collaborative filtering. 
In this paper, we consider a mapping of the items into a vector space and study the prediction problem by assuming an underlying smooth preference function for each user, the quantization at each given vector yields the associated rating. To estimate the preference functions, we implicitly cluster the users with similar ratings to form dominant types. Next, we associate each dominant type with a smooth preference function; i.e., the function values for items with nearby vectors shall be close to each other. \nThe latter is accomplished by a rich representation learning in a so-called frequency domain. In this framework, we propose two approaches for learning user and item representations. First, we use an alternating optimization method in the spirit of k-means to cluster users and map items. We further make this approach less prone to overfitting by a boosting technique. \nSecond, we present a feedforward neural network architecture consisting of interpretable layers which implicitely clusters the users. The performance of the method is evaluated on two benchmark datasets (ML-100k and ML-1M). Albeit the method benefits from simplicity, it shows a remarkable performance and opens a venue for future research. 
All codes are publicly available on the GitLab.", + "title": "Collaborative Filtering with Smooth Reconstruction of the Preference Function", + "authors": [ + "Ali Shirali", + "Reza Kazemi", + "Arash Amini" + ], + "emails": [ + "~Ali_Shirali1", + "~Arash_Amini3", + "~Reza_Kazemi1" + ], + "rank": 2888 + }, + { + "url": "https://openreview.net/forum?id=NTElq-Fo-F4", + "ratings": [ + 3, + 4, + 2, + 5 + ], + "rating": "3.44", + "confidences": [ + 4, + 5, + 4, + 3 + ], + "abstract": "In this paper, we propose a novel approach for analyzing and evaluating how a deep neural network is autonomously learning different features related to programs on different input representations.\nWe trained a simple autoencoder having 5 hidden layers on a dataset containing Java programs, and we tested the ability of each of its neurons in detecting different program features using only unlabeled data for the training phase. For doing that, we designed two binary classification problems having different scopes: while the first one is based on the program cyclomatic complexity, the other one is defined starting from the identifiers chosen by the programmers, making it more related to the functionality (and thus, to some extent, to the semantic) of the program than to its structure. Using different program vector representations as input, we performed experiments considering the two problems, showing how some neurons can be effectively used as classifiers for programs on different binary tasks. 
We also discuss how the program representation chosen as input affects the classification performance, stating that new and customized program embeddings could be designed in order to obtain models able to solve different tasks guided by the proposed benchmarking approach.", + "title": "Analysing Features Learned Using Unsupervised Models on Program Embeddings", + "authors": [ + "Martina Saletta", + "Claudio Ferretti" + ], + "emails": [ + "unimib.it", + "~Martina_Saletta1" + ], + "rank": 2889 + }, + { + "url": "https://openreview.net/forum?id=_-BHVPvT8Wm", + "ratings": [ + 3, + 4, + 3, + 4 + ], + "rating": "3.44", + "confidences": [ + 5, + 4, + 4, + 3 + ], + "abstract": "Variational auto-encoders (VAEs) are an influential and generally-used class of likelihood-based generative models in unsupervised learning. The likelihood-based generative models have been reported to be highly robust to the out-of-distribution (OOD) inputs and can be a detector by assuming that the model assigns higher likelihoods to the samples from the in-distribution (ID) dataset than an OOD dataset. However, recent works reported a phenomenon that VAE recognizes some OOD samples as ID by assigning a higher likelihood to the OOD inputs compared to the one from ID. In this work, we introduce a new model, namely \\textit{Bigeminal Priors Variational auto-encoder (BPVAE)}, to address this phenomenon. The BPVAE aims to enhance the robustness of the VAEs by combing the power of VAE with the two independent priors that belong to the training dataset and simple dataset, which complexity is lower than the training dataset, respectively. BPVAE learns two datasets\u2019 features, assigning a higher likelihood for the training dataset than the simple dataset. In this way, we can use BPVAE\u2019s density estimate for detecting the OOD samples. 
Quantitative experimental results suggest that our model has better generalization capability and stronger robustness than the standard VAEs, proving the effectiveness of the proposed approach of hybrid learning by collaborative priors. Overall, this work paves a new avenue to potentially overcome the OOD problem via multiple latent priors modeling.", + "title": "Bigeminal Priors Variational Auto-encoder", + "authors": [ + "Xuming Ran", + "Mingkun Xu", + "Qi Xu", + "Huihui Zhou", + "Quanying Liu" + ], + "emails": [ + "~Mingkun_Xu1", + "~Qi_Xu1", + "~Huihui_Zhou1", + "~Quanying_Liu1", + "~Xuming_Ran1" + ], + "rank": 2890 + }, + { + "url": "https://openreview.net/forum?id=1fLunL_hDj_", + "ratings": [ + 4, + 4, + 3, + 3 + ], + "rating": "3.44", + "confidences": [ + 5, + 2, + 5, + 4 + ], + "abstract": "It is well accepted that the choice of token vocabulary largely affects the performance of machine translation.\nOne dominant approach to construct a good vocabulary is the Byte Pair Encoding method (BPE). \nHowever, due to expensive trial costs, most previous studies only conduct simple trials with commonly used vocabulary sizes. \nThis paper finds an exciting relation between an information-theoretic feature and BLEU scores with a given vocabulary. \nWith this observation, we formulate the quest of vocabularization -- finding the best token dictionary with a proper size -- as an optimal transport problem. We then propose Info-VOT, a simple and efficient solution without the full and costly trial training. \nWe evaluate our approach on multiple machine translation tasks, including WMT-14 English-German translation, TED bilingual translation, and TED multilingual translation. Empirical results show that Info-VOT can generate well-performing vocabularies on diverse scenarios. Also, one advantage of the proposed approach lies in its low consumption of computation resources. 
On TED bilingual translation, Info-VOT only spends a few CPU hours generating vocabularies, while the traditional BPE-Search solution takes hundreds of GPU hours. ", + "title": "Information-theoretic Vocabularization via Optimal Transport for Machine Translation", + "authors": [ + "Jingjing Xu", + "Hao Zhou", + "Chun Gan", + "Zaixiang Zheng", + "Lei Li" + ], + "emails": [ + "~Jingjing_Xu1", + "bytedance.com", + "~Lei_Li11", + "~Zaixiang_Zheng2" + ], + "rank": 2891 + }, + { + "url": "https://openreview.net/forum?id=kxcpPjZm4rx", + "ratings": [ + 2, + 4, + 5, + 3 + ], + "rating": "3.44", + "confidences": [ + 5, + 5, + 4, + 4 + ], + "abstract": "Generative Adversarial Networks (GAN) are popular generative models of images. Although researchers proposed variants of GAN for different applications, evaluating and comparing GANs is still challenging as GANs may have many failure cases such as low visual quality and model collapse. To alleviate this issue, we propose a novel framework to evaluate the training stability (S), visual quality (Q), and mode diversity (D) of GAN simultaneously. SQD requires only a moderate number of samples, allowing real-time monitoring of the training dynamics of GAN. We showcase the utility of the SQD framework on prevalent GANs and discovered that the gradient penalty (Gulrajani et al., 2017) regularization significantly improves the performance of GAN. 
We also compare the gradient penalty regularization with other regularization methods and reveal that enforcing the 1-Lipschitz condition of the discriminator network stabilizes GAN training.", + "title": "Measuring GAN Training in Real Time", + "authors": [ + "Yuexiang Zhai", + "Bai Jiang", + "Yi Ma", + "Hao Chen" + ], + "emails": [ + "bytedance.com", + "~Yi_Ma4", + "~Yuexiang_Zhai1", + "~Hao_Chen5" + ], + "rank": 2892 + }, + { + "url": "https://openreview.net/forum?id=MhTgnultR1K", + "decision": "Reject", + "ratings": [ + 3, + 4, + 3, + 4 + ], + "rating": "3.44", + "confidences": [ + 4, + 4, + 5, + 3 + ], + "abstract": "Federated learning is a framework for protecting distributed data privacy and has participated in commercial activities. However, there is a lack of a sufficiently reasonable contribution measurement mechanism to distribute the reward for each agent. In the commercial union, if there is no mechanism like this, every agent will get the same reward. This is unfair to agents that provide better data, so such a mechanism is needed. To address this issue, this work proposes a real-time contribution measurement method. Firstly, the method defines the impact of each agent. Furthermore, we comprehensively consider the current round and the previous round to obtain the contribution rate of each agent. To verify effectiveness of the proposed method, the work conducts pseudo-distributed training and an experiment on the Penn Treebank dataset. 
Comparing the Shapley Value in game theory, the comparative experiment result shows that the proposed method is more sensitive to both data quantity and data quality under the premise of maintaining real-time.", + "title": "A Real-time Contribution Measurement Method for Participants in Federated Learning", + "authors": [ + "Bingjie Yan", + "Yize Zhou", + "Boyi Liu", + "Jun Wang", + "Yuhan Zhang", + "Li Liu", + "Xiaolan Nie", + "Zhiwei Fan", + "Zhixuan Liang" + ], + "emails": [ + "~Bingjie_Yan1", + "163.com", + "gmail.com", + "ieee.org", + "~Zhixuan_Liang1", + "hainanu.edu.cn" + ], + "rank": 2893 + }, + { + "url": "https://openreview.net/forum?id=-gabSeMKO4H", + "decision": "Reject", + "ratings": [ + 4, + 4, + 2, + 4 + ], + "rating": "3.44", + "confidences": [ + 4, + 4, + 5, + 5 + ], + "abstract": "Many studies have proven that Translation Memory (TM) can help improve the translation quality of neural machine translation (NMT). Existing ways either employ extra encoder to encode information from TM or concatenate source sentence and TM sentences as encoder's input. These previous methods don't model the semantic relationship between the source sentence and TM sentences. Meanwhile, the training corpus related to TM is limited, and the sentence level retrieval approach further limits its scale. \nIn this paper, we propose a novel method to combine the strengths of both TM and NMT. We treat the matched sentence pair of TM as the additional signal and apply one encoder enhanced by the pre-trained language model (PLM) to encode the TM information and source sentence together. Additionally, we extend the sentence level retrieval method to the n-gram retrieval method that we don't need to calculate the similarity score. Further, we explore new methods to manipulate the information flow from TM to the NMT decoder. We validate our proposed methods on a mixed test set of multiple domains. 
Experiment results demonstrate that the proposed methods can significantly improve the translation quality and show strong adaptation for an unknown or new domain.", + "title": "Translation Memory Guided Neural Machine Translation", + "authors": [ + "Shaohui Kuang", + "Heng Yu", + "Weihua Luo", + "Qiang Wang" + ], + "emails": [ + "~Qiang_Wang8", + "~Shaohui_Kuang1", + "~Heng_Yu1", + "alibaba-inc.com" + ], + "rank": 2894 + }, + { + "url": "https://openreview.net/forum?id=8mVSD0ETOXl", + "decision": "Reject", + "ratings": [ + 3, + 4, + 4, + 3 + ], + "rating": "3.44", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "Specific molecular recognition by proteins, for example, protease enzymes, is critical for maintaining the robustness of key life processes. The substrate specificity landscape of a protease enzyme comprises the set of all sequence motifs that are recognized/cut, or just as importantly, not recognized/cut by the enzyme. Current methods for predicting protease specificity landscapes rely on learning sequence patterns in experimentally derived data with a single enzyme, but are not robust to even small mutational changes. A comprehensive evaluation of specificity requires consideration of the three-dimensional structure and energetics of molecular interactions. In this work, we present a protein graph convolutional neural network (PGCN), which uses a physically intuitive, structure-based molecular interaction graph generated using the Rosetta energy function that describes the topology and energetic features, to determine substrate specificity. We use the PGCN to recapitulate and predict the specificity of the NS3/4 protease from the Hepatitic C virus. We compare our PGCN with previously used machine learning models and show that its performance in classification tasks is equivalent or better. 
Because PGCN is based on physical interactions, it is inherently more interpretable; determination of feature importance reveals key sub-graph patterns responsible for molecular recognition that are biochemically reasonable. The PGCN model also readily lends itself to the design of novel enzymes with tailored specificity against disease targets.", + "title": "Prediction of Enzyme Specificity using Protein Graph Convolutional Neural Networks", + "authors": [ + "Changpeng Lu", + "Samuel Z Stentz", + "Joseph H Lubin", + "Sijian Wang", + "Sagar D Khare" + ], + "emails": [ + "gatech.edu", + "~Sagar_D_Khare1", + "scarletmail.rutgers.edu", + "~Changpeng_Lu1", + "stat.rutgers.edu" + ], + "rank": 2895 + }, + { + "url": "https://openreview.net/forum?id=LFs3CnHwfM", + "decision": "Reject", + "ratings": [ + 2, + 4, + 5, + 3 + ], + "rating": "3.42", + "confidences": [ + 3, + 2, + 3, + 4 + ], + "abstract": "This paper deals with the fuel optimization problem for hybrid electric vehicles in reinforcement learning framework. Firstly, considering the hybrid electric vehicle as a completely observable non-linear system with uncertain dynamics, we solve an open-loop deterministic optimization problem. This is followed by the design of a deep reinforcement learning based optimal controller for the non-linear system using concurrent learning based system identifier such that the actual states and the control policy are able to track the optimal trajectory and optimal policy, autonomously even in the presence of external disturbances, modeling errors, uncertainties and noise and signigicantly reducing the computational complexity at the same time, which is in sharp contrast to the conventional methods like PID and Model Predictive Control (MPC) as well as traditional RL approaches like ADP, DDP and DQN that mostly depend on a set of pre-defined rules and provide sub-optimal solutions under similar conditions. 
The low value of the H-infinity (H\u221e) performance index of the proposed optimization algorithm addresses the robustness issue. The optimization technique thus proposed is compared with the traditional fuel optimization strategies for hybrid electric vehicles to illustate the efficacy of the proposed method.", + "title": "A Robust Fuel Optimization Strategy For Hybrid Electric Vehicles: A Deep Reinforcement Learning Based Continuous Time Design Approach", + "authors": [ + "Nilanjan Mukherjee", + "Sudeshna Sarkar" + ], + "emails": [ + "~Nilanjan_Mukherjee1", + "~Sudeshna_Sarkar1" + ], + "rank": 2896 + }, + { + "url": "https://openreview.net/forum?id=whySRc6f5g_", + "ratings": [ + 3, + 4, + 4, + 3 + ], + "rating": "3.41", + "confidences": [ + 5, + 4, + 3, + 5 + ], + "abstract": "Recently, through numerous works Esteban et al. (2017); Mogren (2016); Yoon et al. (2019), attempts were made to obtain generative models for time series that correctly reproduce the underlying temporal characteristics of a given training dataset. However, we prove in this work that the performance of these models is limited on datasets with high-variability for example containing different classes. In such setups, it is extremely difficult for a generative model to find the right trade-off between sample fidelity i.e. their similarity to the real time series and sample diversity. Furthermore, it is essential to preserve the original classes and the variation within each class. To tackle this issue, we propose a new generative class sensitive model, Class-specific Recurrent GAN (CLaRe-GAN), that conditions the generator on an auxiliary information containing the class-specific and class-independent attributes. Our model relies on class specific encoders: a unique encoder for two contradictory functionalities i.e. extracting the inter- and intra-class attributes. To extract the high-level representation of the time series, we make a shared-latent space assumption Liu et al. (2017). 
At the same time, we use a class discriminator that discriminates between the latent vectors to efficiently extract the class-specific attributes. We test our approach on a set of publicly available datasets where the number of classes, the length and the number of available times series for each class varies and evaluate our approach both visually and computationally. We prove that our model outperforms the state-of-the-art generative models and leads to a significant and consistent improvement in the quality of the generated time series while at the same time preserving the classes and the variation of the original dataset.", + "title": "CLARE-GAN: GENERATION OF CLASS-SPECIFIC TIME SERIES", + "authors": [ + "Hiba Arnout", + "Johanna Bronner", + "Thomas Runkler" + ], + "emails": [ + "~Hiba_Arnout1", + "siemens.com", + "~Thomas_Runkler1" + ], + "rank": 2897 + }, + { + "url": "https://openreview.net/forum?id=TLfjwEFI527", + "ratings": [ + 4, + 4, + 3, + 3 + ], + "rating": "3.41", + "confidences": [ + 3, + 4, + 5, + 5 + ], + "abstract": "An efficient representation of a hierarchical structure is essential for developing intelligent systems because most real-world objects are arranged in hierarchies. A distributional representation has brought great success in numerous natural language processing tasks, and a hierarchy is also successfully represented with embeddings. Particularly, the latest approaches such as hyperbolic embeddings showed significant performance by representing essential meanings in a hierarchy (generality and similarity of objects) with spatial properties (distance from the origin and difference of angles). To achieve such an effective connection in commonly used Euclidean space, we propose Polar Embedding that learns representations with the polar coordinate system. 
In polar coordinates, an object is expressed with two independent variables: radius and angles, which allows us to separately optimize their values on the explicit correspondence of generality and similarity of objects in a hierarchy. Also, we introduce an optimization method combining a loss function controlling gradient and iterative uniformization of distributions. We overcome the issue resulting from the characteristics that the conventional squared loss function makes distance apart as much as possible for irrelevant pairs in spite of the fact the angle ranges are limited. Our results on a standard link-prediction task indicate that polar embedding outperforms other embeddings in low-dimensional Euclidean space and competitively performs even with hyperbolic embeddings, which possess a geometric advantage.", + "title": "Polar Embedding", + "authors": [ + "Ran Iwamoto", + "Ryosuke Kohita", + "Akifumi Wachi" + ], + "emails": [ + "~Ryosuke_Kohita1", + "~Ran_Iwamoto1", + "~Akifumi_Wachi2" + ], + "rank": 2898 + }, + { + "url": "https://openreview.net/forum?id=mgzp2bTOzPW", + "ratings": [ + 3, + 3, + 5, + 3 + ], + "rating": "3.40", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Semantic understanding is an important factor affecting the quality of machine translation of low resource agglutination language. The common methods(sub-word modeling, pre-training word embedding, etc.) will increase the length of the sequence, which leads to a surge in computation. At the same time, the pre-training word embedding with rich context is also the precondition to improve the semantic understanding. Although BERT uses masked language model to generate dynamic embedding in parallel, however, the fine-tuning strategy without mask will make it inconsistent with the training data, which produced human error. 
Therefore, we proposed a word embedding generation method based on improved XLNet, it corrects the defects of the BERT model, and improves the sampling redundancy issues in traditional XLNet. Experiments are carried out on CCMT2019 Mongolian-Chinese, Uyghur-Chinese and Tibetan-Chinese tasks, the results show that the generalization ability and BLEU scores of our method are improved compared with the baseline, which fully verifies the effectiveness of the method.", + "title": "Syntactic Relevance XLNet Word Embedding Generation in Low-Resource Machine Translation", + "authors": [ + "Nier Wu", + "Yatu Ji", + "Hongxu Hou" + ], + "emails": [ + "~Nier_Wu1", + "~Hongxu_Hou1", + "~Yatu_Ji1" + ], + "rank": 2899 + }, + { + "url": "https://openreview.net/forum?id=sxZvLS2ZPfH", + "decision": "Reject", + "ratings": [ + 4, + 5, + 2, + 3 + ], + "rating": "3.38", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "Despite the development of pre-trained language models (PLMs) significantly raise the performances of various Chinese natural language processing (NLP) tasks, the vocabulary for these Chinese PLMs remain to be the one provided by Google Chinese Bert, which is based on Chinese characters. Second, the masked language model pre-training is based on a single vocabulary, which limits its downstream task performances. In this work, we first propose a novel method, seg_tok, to form the vocabulary of Chinese BERT, with the help of Chinese word segmentation (CWS) and subword tokenization. Then we propose three versions of multi-vocabulary pretraining (MVP) to improve the models expressiveness. Experiments show that: (a) compared with char based vocabulary, seg_tok does not only improves the performances of Chinese PLMs on sentence level tasks, it can also improve efficiency; (b) MVP improves PLMs' downstream performance, especially it can improve seg_tok's performances on sequence labeling tasks. 
", + "title": "MVP-BERT: Redesigning Vocabularies for Chinese BERT and Multi-Vocab Pretraining", + "authors": [ + "Wei Zhu" + ], + "emails": [ + "~Wei_Zhu5" + ], + "rank": 2900 + }, + { + "url": "https://openreview.net/forum?id=h9XgC7JzyHZ", + "decision": "Reject", + "ratings": [ + 4, + 4, + 2, + 4 + ], + "rating": "3.38", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "Optimal transport distances (OT) have been widely used in recent work in Machine Learning as ways to compare probability distributions. These are costly to compute when the data lives in high dimension.\nRecent work aims specifically at reducing this cost by computing OT using low-rank projections of the data (seen as discrete measures)~\\citep{paty2019subspace}. We extend this approach and show that one can approximate OT distances by using more general families of maps provided they are 1-Lipschitz. The best estimate is obtained by maximising OT over the given family. As OT calculations are done after mapping data to a lower dimensional space, our method scales well with the original data dimension.\nWe demonstrate the idea with neural networks. ", + "title": "Efficient estimates of optimal transport via low-dimensional embeddings", + "authors": [ + "Patric Fulop", + "Vincent Danos" + ], + "emails": [ + "~Patric_Fulop1", + "~Vincent_Danos1" + ], + "rank": 2901 + }, + { + "url": "https://openreview.net/forum?id=lpwg-UOkAl7", + "ratings": [ + 3, + 3, + 4 + ], + "rating": "3.36", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "Anomaly detection (AD) on hyperspectral images (HSIs) is of great importance in both space exploration and earth observations. However, the challenges caused by insufficient datasets, no labels, and noise corruption substantially downgrade the quality of detection. For solving these problems, this paper proposes a sparse coding-inspired generative adversarial network (GAN) for weakly supervised HAD, named sparseHAD. 
It can learn a discriminative latent reconstruction with small errors for background samples and large errors for anomaly samples. First, we design a novel background-category searching step to eliminate the difficulty of data annotation and prepare for weakly supervised learning. Then, a sparse coding-inspired regularized network is integrated into an end-to-end GAN to form a weakly supervised spectral mapping model consisting of two encoders, a decoder, and a discriminator. This model not only makes the network more robust and interpretable both experimentally and theoretically but also develops a new sparse coding-inspired path for HAD. Subsequently, the proposed sparseHAD detect anomalies in latent space rather than original space, which also contributes to the robustness of the network against noise. Quantitative assessments and experiments over real HSIs demonstrate the unique promise of such an approach.", + "title": "Sparse Coding-inspired GAN for Weakly Supervised Hyperspectral Anomaly Detection", + "authors": [ + "Tao Jiang", + "Weiying Xie", + "Jie Lei", + "Yunsong Li", + "Zan Li" + ], + "emails": [ + "~Yunsong_Li1", + "stu.xidian.edu.cn", + "xidian.edu.cn", + "mail.xidian.edu.cn", + "~Weiying_Xie1" + ], + "rank": 2902 + }, + { + "url": "https://openreview.net/forum?id=EeeOTYhLlVm", + "decision": "Reject", + "ratings": [ + 3, + 4, + 3 + ], + "rating": "3.36", + "confidences": [ + 4, + 4, + 3 + ], + "abstract": "Epidemiologists model the dynamics of epidemics in order to propose control strategies based on pharmaceutical and non-pharmaceutical interventions (contact limitation, lock down, vaccination, etc). Hand-designing such strategies is not trivial because of the number of possible interventions and the difficulty to predict long-term effects. This task can be cast as an optimization problem where state-of-the-art machine learning algorithms such as deep reinforcement learning might bring significant value. 
However, the specificity of each domain - epidemic modelling or solving optimization problems - requires strong collaborations between researchers from different fields of expertise.\nThis is why we introduce EpidemiOptim, a Python toolbox that facilitates collaborations between researchers in epidemiology and optimization. EpidemiOptim turns epidemiological models and cost functions into optimization problems via a standard interface commonly used by optimization practitioners (OpenAI Gym). Reinforcement learning algorithms based on Q-Learning with deep neural networks (DQN) and evolutionary algorithms (NSGA-II) are already implemented. We illustrate the use of EpidemiOptim to find optimal policies for dynamical on-off lock-down control under the optimization of death toll and economic recess using a Susceptible-Exposed-Infectious-Removed (SEIR) model for SARS-CoV-2/COVID-19. \nUsing EpidemiOptim and its interactive visualization platform in Jupyter notebooks, epidemiologists, optimization practitioners and others (e.g. economists) can easily compare epidemiological models, costs functions and optimization algorithms to address important choices to be made by health decision-makers.", + "title": "EpidemiOptim: A Toolbox for the Optimization of Control Policies in Epidemiological Models", + "authors": [ + "C\u00e9dric Colas", + "Boris Hejblum", + "S\u00e9bastien Rouillon", + "Rodolphe Thiebaut", + "Pierre-Yves Oudeyer", + "Cl\u00e9ment Moulin-Frier", + "M\u00e9lanie Prague" + ], + "emails": [ + "inria.fr", + "~Pierre-Yves_Oudeyer1", + "~Cl\u00e9ment_Moulin-Frier2", + "u-bordeaux.fr", + "~C\u00e9dric_Colas1" + ], + "rank": 2903 + }, + { + "url": "https://openreview.net/forum?id=bLGZW0hIQpO", + "decision": "Reject", + "ratings": [ + 6, + 2, + 3 + ], + "rating": "3.36", + "confidences": [ + 3, + 5, + 3 + ], + "abstract": "Counterfactual regret minimization (CFR) methods are effective for solving two player zero-sum extensive games with imperfect information. 
Regret matching (RM) plays a crucial role in CFR and its variants to approach Nash equilibrium. In this paper, we present Temperature Regret Matching (TRM), a novel RM algorithm that adopts a different strategy. Also, we consider not only the opponent's strategy under the current strategy but also the opponent's strategies of the several last iterations for updating the external regret of each iteration. Furthermore, we theoretically demonstrate that the update of TRM converges to Nash Equilibrium. Competitive results in imperfect-information games have verified its effectiveness and efficiency.", + "title": "Temperature Regret Matching for Imperfect-Information Games", + "authors": [ + "Enmin Zhao", + "Kai Li", + "Junliang Xing" + ], + "emails": [ + "ia.ac.cn", + "~Kai_Li2", + "~Junliang_Xing1" + ], + "rank": 2904 + }, + { + "url": "https://openreview.net/forum?id=NqJw2sVJbC8", + "ratings": [ + 4, + 3, + 3 + ], + "rating": "3.33", + "confidences": [ + 4, + 4, + 4 + ], + "abstract": "Algorithmic trading systems are often completely automated, and deep learning is increasingly receiving attention in this domain. Nonetheless, little is known about the robustness properties of these models. We study valuation models for algorithmic trading from the perspective of adversarial machine learning. We introduce new attacks specific to this domain with size constraints that minimize attack costs. We further discuss how these attacks can be used as an analysis tool to study and evaluate the robustness properties of financial models. 
Finally, we investigate the feasibility of realistic adversarial attacks in which an adversarial trader fools automated trading systems into making inaccurate predictions.", + "title": "Adversarial Attacks on Machine Learning Systems for High-Frequency Trading", + "authors": [ + "Micah Goldblum", + "Avi Schwarzschild", + "Ankit Patel", + "Tom Goldstein" + ], + "emails": [ + "~Tom_Goldstein1", + "~Micah_Goldblum1", + "~Avi_Schwarzschild1", + "~Ankit_Patel1" + ], + "rank": 2905 + }, + { + "url": "https://openreview.net/forum?id=yN5kwvn4E1R", + "decision": "Reject", + "ratings": [ + 4, + 2, + 4, + 3 + ], + "rating": "3.32", + "confidences": [ + 5, + 4, + 5, + 5 + ], + "abstract": "As a powerful representation learning method on graph data, graph neural networks (GNNs) have shown great popularity in tackling graph analytic problems. Although many attempts have been made in literatures to find strategies about extracting better embedding of the target nodes, few of them consider this issue from a comprehensive perspective. Most of current GNNs usually employ some single method which can commendably extract a certain kind of feature but some equally important features are often ignored. In this paper, we develop a novel dual graph complementary network (DGCN) to learn representation complementarily. We use two different branches, and inputs of the two branches are the same, which are composed of structure and feature information. At the same time, there is also a complementary relationship between the two branches. 
Beyond that, our extensive experiments show that DGCN outperforms state-of-the-art methods on five public benchmark datasets.", + "title": "Dual Graph Complementary Network", + "authors": [ + "Chenhua Liu", + "Kun Zhan" + ], + "emails": [ + "~Kun_Zhan1", + "lzu.edu.cn" + ], + "rank": 2906 + }, + { + "url": "https://openreview.net/forum?id=zKw2t3haCSX", + "ratings": [ + 5, + 3, + 2, + 4 + ], + "rating": "3.31", + "confidences": [ + 4, + 5, + 5, + 2 + ], + "abstract": "In recent years, neural networks have demonstrated their ability to learn previously intractable problems, particularly in the field of computer vision. Although classification accuracy continues to increase for the most challenging benchmark datasets, model efficacy evaluations typically focus on raw accuracy results without a consideration for the types of errors being made. Further, most networks are trained such that classes are treated as separate, unrelated categories without any notion of higher order semantic relationships. This work shows a simple approach for embedding semantic relationships into learned representations via category-aware label smoothing. Using MNIST experiments, we demonstrate that preferable error profiles can be enforced this way and that underparameterized networks without the capacity to achieve reasonable raw accuracy are still capable of learning an effective model when relative error cost is taken into consideration. Additionally, we embed hierarchical information into cifar10 class labels to show that it is possible to enforce arbitrary semantic hierarchies using this method. Further, we use a new method for analyzing class hierarchy in hidden representations, Neurodynamical Agglomerative Analyisis (NAA), to show that latent class relationships in this analysis model tend toward the relationships of the label vectors as the data is projected deeper into the network. 
", + "title": "Embedding semantic relationships in hidden representations via label smoothing", + "authors": [ + "Michael Marino", + "Pascal Nieters", + "Gunther Heidemann", + "Joachim Hertzberg" + ], + "emails": [ + "~Joachim_Hertzberg1", + "~Michael_Marino1", + "uos.de" + ], + "rank": 2907 + }, + { + "url": "https://openreview.net/forum?id=ANednkwrr8s", + "ratings": [ + 3, + 4, + 4, + 2 + ], + "rating": "3.29", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Vulnerability of state-of-the-art computer vision models to image perturbations has drawn considerable attention recently. Often these perturbations are imperceptible to humans because they target the perception of deep neural networks (DNNs) employed in the corresponding computer vision task. Recent studies have revealed that DNNs, which are unable to handle targeted perturbation often fail to handle untargeted perturbations as well such as Gaussian noise. Various techniques in past have been explored to mitigate both these types of perturbations ranging from classical preprocessing to current supervised and self-supervised deep discriminative and generative models. However, a common challenge with most of these techniques is that they approach the problem from a quality enhancement point of view, which is primarily driven by human perception. In addition, the supervised models require a large volume of gold standard unperturbed data, whereas others fail to take into account the feedback of the targeted downstream DNN. We propose to model this problem in indirect supervision framework, where we assume that the gold standard data is missing, however, a variable dependent on it is available and the dependency of the observed variable is stated by the considered downstream DNN. The proposed method maintains the advantages of supervised models while relaxing the requirement of gold standard unperturbed data. 
To prove its utility, we conduct several experiments on various network architectures for downstream tasks of classification and medical image segmentation. We used MNIST, CIFAR-10-C and ISIC skin lesion dataset in our experiments. In all the experiments, a considerable restoration in the performance of the considered downstream model is observed.", + "title": "Indirect Supervision to Mitigate Perturbations", + "authors": [ + "Mayank Kumar Kundalwal", + "Azad Singh", + "Deepak Mishra" + ], + "emails": [ + "~Mayank_Kumar_Kundalwal1", + "iitj.ac.in", + "~Deepak_Mishra5" + ], + "rank": 2908 + }, + { + "url": "https://openreview.net/forum?id=IOqr2ZyXHz1", + "decision": "Reject", + "ratings": [ + 4, + 4, + 3, + 2 + ], + "rating": "3.29", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "The era of real world evidence has witnessed an increasing availability of observational data, which much facilitates the development of causal effect inference. Although significant advances have been made to overcome the challenges in causal effect estimation, such as missing counterfactual outcomes and selection bias, they only focus on source-specific and stationary observational data. In this paper, we investigate a new research problem of causal effect inference from incrementally available observational data, and present three new evaluation criteria accordingly, including extensibility, adaptability, and accessibility. We propose a Continual Causal Effect Representation Learning method for estimating causal effect with observational data, which are incrementally available from non-stationary data distributions. Instead of having access to all seen observational data, our method only stores a limited subset of feature representations learned from previous data. 
Combining the selective and balanced representation learning, feature representation distillation, and feature transformation, our method achieves the continual causal effect estimation for new data without compromising the estimation capability for original data. Extensive experiments demonstrate the significance of continual causal effect inference and the effectiveness of our method.", + "title": "Continual Lifelong Causal Effect Inference with Real World Evidence", + "authors": [ + "Zhixuan Chu", + "Stephen Rathbun", + "Sheng Li" + ], + "emails": [ + "~Zhixuan_Chu1", + "~Sheng_Li3", + "uga.edu" + ], + "rank": 2909 + }, + { + "url": "https://openreview.net/forum?id=ghjxvfgv9ht", + "decision": "Reject", + "ratings": [ + 4, + 2, + 4 + ], + "rating": "3.29", + "confidences": [ + 5, + 5, + 4 + ], + "abstract": " Deep learning tasks with small datasets are often tackled by pretraining models with large datasets on relevent tasks. Although pretraining methods mitigate the problem of overfitting, it can be difficult to find appropriate pretrained models sometimes. In this paper, we proposed a self-pretraininng method by exploiting patch information in the dataset itself without pretraining on other datasets. Our experiments show that the self-pretraining method leads to better performance than training from scratch both in the condition of not using other data.", + "title": "Self-Pretraining for Small Datasets by Exploiting Patch Information", + "authors": [ + "Zhang Chunyang" + ], + "emails": [ + "~Zhang_Chunyang1" + ], + "rank": 2910 + }, + { + "url": "https://openreview.net/forum?id=bFnn6lPn3Sp", + "decision": "Reject", + "ratings": [ + 4, + 3, + 3 + ], + "rating": "3.29", + "confidences": [ + 4, + 5, + 5 + ], + "abstract": "Cross-modal associations between a person's voice and face can be learned algorithmically, and this is a useful functionality in many audio and visual applications. The problem can be defined as two tasks: voice-face matching and retrieval. 
Recently, this topic has attracted much research attention, but it is still in its early stages of development, and evaluation protocols and test schemes need to be more standardized. Performance metrics for different subtasks are also scarce, and a benchmark for this problem needs to be established. In this paper, a baseline evaluation framework is proposed for voice-face matching and retrieval tasks. Test confidence is analyzed, and a confidence interval for estimated accuracy is proposed. Various state-of-the-art performances with high test confidence are achieved on a series of subtasks using the baseline method (called TriNet) included in this framework. The source code will be published along with the paper. The results of this study can provide a basis for future research on voice-face cross-modal learning.", + "title": "A Benchmark for Voice-Face Cross-Modal Matching and Retrieval", + "authors": [ + "Chuyuan Xiong", + "Deyuan Zhang", + "Tao Liu", + "Xiaoyong Du", + "Jiankun Tian", + "Songyan Xue" + ], + "emails": [ + "~Deyuan_Zhang1", + "~Jiankun_Tian1", + "~Chuyuan_Xiong1", + "~Tao_Liu1", + "~Songyan_Xue1", + "~Xiaoyong_Du1" + ], + "rank": 2911 + }, + { + "url": "https://openreview.net/forum?id=6IVdytR2W90", + "decision": "Reject", + "ratings": [ + 3, + 3, + 4, + 3 + ], + "rating": "3.28", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "Feature fusion is beneficial to object detection tasks in two folds. On one hand, detail and position information can be combined with semantic information when high and low-resolution features from shallow and deep layers are fused. On the other hand, objects can be detected in different scales, which improves the robustness of the framework. In this work, we present a Multi-Scale Fusion Module (MSFM) that extracts both detail and semantical information from a single input but at different scales within the same layer. 
Specifically, the input of the module will be resized into different scales on which position and semantic information will be processed, and then they will be rescaled back and combined with the module input. The MSFM is lightweight and can be used as a drop-in layer to many existing object detection frameworks. Experiments show that MSFM can bring +2.5% mAP improvement with only 2.4M extra parameters on Faster R-CNN with ResNet-50 FPN backbone on COCO Object Detection minival set, outperforming that with ResNet-101 FPN backbone without the module which obtains +2.0% mAP with 19.0M extra parameters. The best resulting model achieves a 45.7% mAP on test-dev set. Code will be available.", + "title": "MSFM: Multi-Scale Fusion Module for Object Detection", + "authors": [ + "Xuesong Wang", + "Caisheng Wang" + ], + "emails": [ + "~Caisheng_Wang1", + "~Xuesong_Wang2" + ], + "rank": 2912 + }, + { + "url": "https://openreview.net/forum?id=HPGtPvFNROh", + "decision": "Reject", + "ratings": [ + 4, + 4, + 2 + ], + "rating": "3.27", + "confidences": [ + 3, + 4, + 4 + ], + "abstract": "The ongoing digitization of health records within the healthcare industry results in large-scale datasets. Manually extracting clinically-useful insight from such datasets is non-trivial. However, doing so at scale while simultaneously leveraging patient-specific attributes such as sex and age can assist with clinical-trial enrollment, medical school educational endeavours, and the evaluation of the fairness of neural networks. To facilitate the reliable extraction of clinical information, we propose to learn embeddings, known as clinical prototypes (CPs), via supervised contrastive learning. We show that CPs can be efficiently used for large-scale retrieval and clustering of physiological signals based on multiple patient attributes. 
We also show that CPs capture attribute-specific semantic relationships.", + "title": "DROPS: Deep Retrieval of Physiological Signals via Attribute-specific Clinical Prototypes", + "authors": [ + "Dani Kiyasseh", + "Tingting Zhu", + "David A. Clifton" + ], + "emails": [ + "~Dani_Kiyasseh1", + "~David_A._Clifton1", + "eng.ox.ac.uk" + ], + "rank": 2913 + }, + { + "url": "https://openreview.net/forum?id=LgqmtA-wzXi", + "ratings": [ + 5, + 3, + 4, + 2 + ], + "rating": "3.27", + "confidences": [ + 3, + 4, + 3, + 5 + ], + "abstract": "We consider a Multi-Armed Bandit problem in which the rewards are non-stationary and are dependent on past actions and potentially on past contexts. At the heart of our method, we employ a recurrent neural network, which models these sequences. \nIn order to balance between exploration and exploitation, we present an energy minimization term that prevents the neural network from becoming too confident in support of a certain action. This term provably limits the gap between the maximal and minimal probabilities assigned by the network. In a diverse set of experiments, we demonstrate that our method is at least as effective as methods suggested to solve the sub-problem of Rotting Bandits, can solve intuitive extensions of various benchmark problems, and is effective in a real-world recommendation system scenario.", + "title": "Solving Non-Stationary Bandit Problems with an RNN and an Energy Minimization Loss", + "authors": [ + "Michael Rotman", + "Lior Wolf" + ], + "emails": [ + "~Michael_Rotman1", + "~Lior_Wolf1" + ], + "rank": 2914 + }, + { + "url": "https://openreview.net/forum?id=sTcyRPRQ2VT", + "ratings": [ + 3, + 4, + 3 + ], + "rating": "3.27", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "Domain-specific Knowledge Graph (KG) generation is a labor intensive task usually orchestrated and supervised by subject matter experts. Herein, we propose a strategy to automate the generation process following a two-step approach. 
Initially, the structure of the domain of interest is inferred from the corpus in the form of a metagraph. Afterwards, once the domain structure has been discovered, named entity recognition (NER) and relation extraction (RE) models can be used to generate a domain-specific KG. We argue why the automated definition of the domain's structure as a first step is beneficial both in terms of construction time and quality of the generated graph. Furthermore, we present a machine learning approach, based on Transformers, to infer the structure of a corpus's domain. The proposed method is extensively validated on three public datasets (WebNLG, NYT and DocRED) by comparing it with two reference methods using CNNs and RNNs. Lastly, we demonstrate how this work lays the foundation for fully automated and unsupervised KG generation.", + "title": "An Automated Domain Understanding Technique for Knowledge Graph Generation", + "authors": [ + "Dimitrios Christofidellis", + "Matteo Manica", + "Leonidas Georgopoulos", + "Hans Vandierendonck" + ], + "emails": [ + "qub.ac.uk", + "~Dimitrios_Christofidellis1", + "~Matteo_Manica1", + "zurich.ibm.com" + ], + "rank": 2915 + }, + { + "url": "https://openreview.net/forum?id=tY38nwwdCDa", + "decision": "Reject", + "ratings": [ + 3, + 5, + 2, + 3 + ], + "rating": "3.26", + "confidences": [ + 5, + 5, + 5, + 4 + ], + "abstract": "Today, many of the machine learning models are extremely data hungry. On the other hand, the accuracy of the algorithms used is very often affected by the amount of the training data available, which is, unfortunately, rarely abundant. Fortunately, image augmentation is one of the very powerful techniques that can be used by computer-vision engineers to expand their existing image data sets. This paper presents an innovative way for creating a variation of existing images and introduces the idea of using an Object-Focused Image (OFI). 
This is when an image includes only the labeled object and everything else is made transparent. The objective of OFI method is to expand the existing image data set and hence improve the accuracy of the model used to classify images. This paper also elaborates on the OFI approach and compares the accuracy of five different models with the same network design and settings but with different content of the training data set. The experiments presented in this paper show that using OFIs along with the original images can lead to an increase in the validation accuracy of the used model. In fact, when the OFI technique is used, the number of the images supplied nearly doubles.", + "title": "USING OBJECT-FOCUSED IMAGES AS AN IMAGE AUGMENTATION TECHNIQUE TO IMPROVE THE ACCURACY OF IMAGE-CLASSIFICATION MODELS WHEN VERY LIMITED DATA SETS ARE AVAILABLE", + "authors": [ + "Ahmad Melhem Hammoud", + "Ahmad Rabih Ghandour" + ], + "emails": [ + "mail.aub.edu", + "~Ahmad_Melhem_Hammoud1" + ], + "rank": 2916 + }, + { + "url": "https://openreview.net/forum?id=BZeewPpFBI6", + "ratings": [ + 5, + 3, + 2, + 3 + ], + "rating": "3.25", + "confidences": [ + 5, + 5, + 5, + 5 + ], + "abstract": "Deep neural networks obtain remarkable achievements in diverse real-world applications. However, their success relies on the availability of large amounts of labeled data. A trained model may fail to generalize well on a domain whose distribution differs from the training data distribution. Collecting abundant labeled data for all domains of interest are expensive and time-consuming, sometimes even impossible. Domain adaptation sets out to address this problem, aiming to leverage labeled data in the source domain to learn a good predictive model for the target domain whose labels are scarce or unavailable. A mainstream approach is adversarial domain adaptation, which learns domain invariant-features by performing alignment across different distributions. 
Most domain adaptation methods focus on reducing the divergence between two domains to make the improvement. A prerequisite of domain adaptation is the adaptability, which is measured by the expected error of the ideal joint hypothesis on the source and target domains, should be kept at a small value in the process of domain alignment. However, adversarial learning may degrade the adaptability, since it distorts the original distributions by suppressing the domain-specific information. In this paper, we propose an approach, which focuses on strengthening the model's adaptability, for domain adaptation. Our proposed dual adversarial training (DAT) method introduces class-invariant features to enhance the discriminability of the latent space without sacrificing the transferability. The class-invariant features, extracted from the source domain, can play a positive role in the classification on the target domain. We demonstrate the effectiveness of our method by yielding state-of-the-art results on several benchmarks.", + "title": "Dual Adversarial Training for Unsupervised Domain Adaptation", + "authors": [ + "Yuan Wu", + "Diana Inkpen", + "Ahmed El-Roby" + ], + "emails": [ + "~Diana_Inkpen1", + "~Yuan_Wu2", + "~Ahmed_El-Roby1" + ], + "rank": 2917 + }, + { + "url": "https://openreview.net/forum?id=u9ax42K7ND", + "decision": "Reject", + "ratings": [ + 3, + 4, + 3, + 3 + ], + "rating": "3.25", + "confidences": [ + 3, + 4, + 4, + 5 + ], + "abstract": "Deep reinforcement learning algorithms aim to achieve human-level intelligence by solving practical decisions-making problems, which are often composed of multiple sub-tasks. Complex and subtle relationships between sub-tasks make traditional methods hard to give a promising solution. We implement a first-person shooting environment with random spatial structures to illustrate a typical representative of this kind. 
A desirable agent should be capable of balancing between different sub-tasks: navigation to find enemies and shooting to kill them. To address the problem brought by the environment, we propose a Meta Soft Hierarchical reinforcement learning framework (MeSH), in which each low-level sub-policy focuses on a specific sub-task respectively and high-level policy automatically learns to utilize low-level sub-policies through meta-gradients. The proposed framework is able to disentangle multiple sub-tasks and discover proper low-level policies under different situations. The effectiveness and efficiency of the framework are shown by a series of comparison experiments. Both environment and algorithm code will be provided for open source to encourage further research.", + "title": "Hierarchical Meta Reinforcement Learning for Multi-Task Environments", + "authors": [ + "Dongyang Zhao", + "Yue Huang", + "Changnan Xiao", + "Yue Li", + "Shihong Deng" + ], + "emails": [ + "~Yue_Li5", + "~Yue_Huang5", + "~Shihong_Deng1", + "~Dongyang_Zhao1", + "~Changnan_Xiao1" + ], + "rank": 2918 + }, + { + "url": "https://openreview.net/forum?id=iUTHidd-ylL", + "decision": "Reject", + "ratings": [ + 3, + 4, + 3, + 3 + ], + "rating": "3.24", + "confidences": [ + 5, + 4, + 4, + 4 + ], + "abstract": "In this work, we present a fully convolutional end to end method to reconstruct corrupted sparse matrices of Non-Euclidean data. The classic example for such matrices is recommender systems matrices where the rows/columns represent items/users and the entries are ratings. The method we present is inspired by the surprising and spectacular success of methods like\" deep image prior\" and \u2018\u2018deep decoder\" for corrupted image completion. 
In sharp contrast to previous Matrix Completion methods wherein the latent matrix or its factors directly serve as the optimization variable, in the method we present, the matrix is parameterized as the weights of a graph neural network acting on a random noisy input. Then we are tuning the network parameters to get a result as close as possible to the initial sparse matrix (using its factors) getting that way state of the art matrix completion result. In addition to the conceptual simplicity of our method, which is just Non-Euclidean generalization of deep image priors, it holds fewer parameters than previously presented methods which makes the parameters more trackable and the method more computationally efficient and more applicable for the real-world tasks. The method also achieves state-of-the-art results for the matrix completion task on the classical benchmarks in the field. The method also surprisingly shows that untrained convolutional neural network can use a good prior not only for image completion but also for Matrix Completion when redefined for graphs.", + "title": "Matrix Data Deep Decoder - Geometric Learning for Structured Data Completion", + "authors": [ + "Maria Schmidt", + "Alexander Bronstein" + ], + "emails": [ + "~Maria_Schmidt1", + "~Alexander_Bronstein1" + ], + "rank": 2919 + }, + { + "url": "https://openreview.net/forum?id=IlJbTsygaI6", + "decision": "Reject", + "ratings": [ + 3, + 4, + 3, + 3 + ], + "rating": "3.24", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "Deep Reinforcement Learning agents achieve state-of-the-art performance in many tasks at the cost of making them black-boxes, hard to interpret and understand, making their use difficult in trusted applications, such as robotics or industrial applications. 
We introduce goal-based interpretability, where the agent produces goals which show the reason for its current actions (reach the current goal) and future goals indicate its desired future behavior without having to run the environment, a useful property in environments with no simulator. Additionally, in many environments, the goals can be visualized to make them easier to understand for non-experts. To have a goal-producing agent without requiring domain knowledge, we use 2-layer hierarchical agents where the top layer produces goals and the bottom layer attempts to reach those goals. \n\nMost classical reinforcement learning algorithms cannot be used train goal-producing hierarchical agents. We introduce a new algorithm to train these more interpretable agents, called HAC-General with Teacher, an extension of the Hindsight Actor-Critic (HAC) algorithm that adds 2 key improvements: (1) the goals now consist of a state s to be reached and a reward r to be collected, making it possible for the goal-producing policy to incentivize the goal-reaching policy to go through high-reward paths and (2) an expert teacher is leveraged to improve the training of the hierarchical agent, in a process similar but distinct to imitation learning and distillation. Contrarily to HAC, there is no requirement that environments need to provide the desired end state. 
Additionally, our experiments show that it has better performance and learns faster than HAC, and can solve environments that HAC fails to solve.", + "title": "Explainable Reinforcement Learning Through Goal-Based Interpretability", + "authors": [ + "Gregory Bonaert", + "Youri Coppens", + "Denis Steckelmacher", + "Ann Nowe" + ], + "emails": [ + "~Denis_Steckelmacher1", + "~Youri_Coppens1", + "~Ann_Nowe1", + "~Gregory_Bonaert1" + ], + "rank": 2920 + }, + { + "url": "https://openreview.net/forum?id=wUUKCAmBx6q", + "decision": "Reject", + "ratings": [ + 2, + 4, + 4, + 3 + ], + "rating": "3.23", + "confidences": [ + 4, + 4, + 3, + 2 + ], + "abstract": "This paper presents and investigates a novel and timely application domain for deep learning: sub-second traffic flow modelling in IP networks. Traffic flows are the most fundamental components in an IP based networking system. The accurate modelling of the generative patterns of these flows is crucial for many practical network applications. However, the high nonlinearity and dynamics of both the traffic and network conditions make this task challenging, particularly at the time granularity of sub-second. In this paper, we cast this problem as a representation learning task to model the intricate patterns in data traffic according to the IP network structure and working mechanism. Accordingly, we propose a customized Flow Neural Network, which works in a self-supervised way to extract the domain-specific data correlations. 
We report the state-of-the-art performances on both synthetic and realistic traffic patterns on multiple practical network applications, which provides a good testament to the strength of our approach.", + "title": "Flow Neural Network for Traffic Flow Modelling in IP Networks", + "authors": [ + "Xiangle Cheng", + "Yuchen He", + "Feifei Long", + "Shihan Xiao", + "Fenglin Li" + ], + "emails": [ + "~Xiangle_Cheng1", + "uwaterloo.ca", + "huawei.com" + ], + "rank": 2921 + }, + { + "url": "https://openreview.net/forum?id=C5th0zC9NPQ", + "decision": "Reject", + "ratings": [ + 5, + 2, + 3 + ], + "rating": "3.23", + "confidences": [ + 4, + 5, + 4 + ], + "abstract": "Situated cognition depends on accessing environmental state through sensors. Engineering and cost constraints usually lead to limited \u201cpathways\u201d where, for example, a vision sub-system only includes a camera and the software to deal with it. This traditional and rational design style entails any hardware defect on the pathway causes the system to grind to a halt until repair. We propose a \u201csensoriplexer\u201d as drop-in neural component architecture to address this issue, under the common scenario of multiple sensors availability. This component architecture learns to mix and relate pathways, such that an agent facing failure in a sensory sub-system can degrade gracefully and coherently by relying on its other sub- systems. The architecture is inspired by the concept of synesthesia, and relies on statistical coupling between sensor signals. 
We show the benefit and limitation of the architecture on a simple shape recognition and a more complex emotion recognition scenarios.", + "title": "Sensory Resilience based on Synesthesia", + "authors": [ + "Eric Platon", + "Tom Sonoda" + ], + "emails": [ + "~Eric_Platon1", + "~Tom_Sonoda1" + ], + "rank": 2922 + }, + { + "url": "https://openreview.net/forum?id=LzhEvTWpzH", + "decision": "Reject", + "ratings": [ + 2, + 3, + 4, + 4 + ], + "rating": "3.22", + "confidences": [ + 5, + 4, + 4, + 5 + ], + "abstract": "In neural machine translation (NMT), data augmentation methods such as back-translation make it possible to use extra monolingual data to help improve translation performance, while it needs extra training data and the in-domain monolingual data is not always available. In this paper, we present a novel data augmentation method for neural machine translation by using only the original training data without extra data. More accurately, we randomly replace words or mixup with their aligned alternatives in another language when training neural machine translation models. Since aligned word pairs appear in the same position of each other during training, it is helpful to form bilingual embeddings which are proved useful to provide a performance boost \\citep{liu2019shared}. Experiments on both small and large scale datasets show that our method significantly outperforms the baseline models.", + "title": "Switching-Aligned-Words Data Augmentation for Neural Machine Translation", + "authors": [ + "Fengshun Xiao", + "Zuchao Li", + "hai zhao" + ], + "emails": [ + "~Fengshun_Xiao1", + "~Zuchao_Li1", + "~hai_zhao1" + ], + "rank": 2923 + }, + { + "url": "https://openreview.net/forum?id=XKgo1UfNRx8", + "decision": "Reject", + "ratings": [ + 3, + 4, + 3, + 3 + ], + "rating": "3.22", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "Ensemble Deep Learning improves accuracy over a single model by combining predictions from multiple models. 
It has established itself to be the core strategy for tackling the most difficult problems, like winning Kaggle challenges. Due to the lack of consensus to design a successful deep learning ensemble, we introduce Hyperband-Dijkstra, a new workflow that automatically explores neural network designs with Hyperband and efficiently combines them with Dijkstra's algorithm. This workflow has the same training cost than standard Hyperband running except sub-optimal solutions are stored and are candidates to be selected in the ensemble selection step (recycling). Next, to predict on new data, the user gives to Dijkstra the maximum number of models wanted in the ensemble to control the tradeoff between accuracy and inference time. \nHyperband is a very efficient algorithm allocating exponentially more resources to the most promising configurations. It is also capable to propose diverse models due to its pure-exploration nature, which allows Dijkstra algorithm with a smart combination of diverse models to achieve a strong variance and bias reduction. The exploding number of possible combinations generated by Hyperband increases the probability that Dijkstra finds an accurate combination which fits the dataset and generalizes on new data.\nThe two experimentation on CIFAR100 and on our unbalanced microfossils dataset show that our new workflow generates an ensemble far more accurate than any other ensemble of any ResNet models from ResNet18 to ResNet152.", + "title": "Recycling sub-optimial Hyperparameter Optimization models to generate efficient Ensemble Deep Learning", + "authors": [ + "Pierrick Pochelu", + "Bruno Conche", + "Serge G. 
Petiton" + ], + "emails": [ + "~Pierrick_Pochelu1", + "univ-lille.fr", + "total.com" + ], + "rank": 2924 + }, + { + "url": "https://openreview.net/forum?id=r6I3EvB9eDO", + "decision": "Reject", + "ratings": [ + 3, + 3, + 4, + 3 + ], + "rating": "3.20", + "confidences": [ + 4, + 4, + 3, + 4 + ], + "abstract": "Humans leverage compositionality for flexible and efficient learning, but current machine learning algorithms lack such ability. Despite many efforts in specific cases, there is still absence of theories and tools to study it systematically. In this paper, we leverage group theory to mathematically prove necessary and sufficient conditions for two fundamental questions of compositional representations. (1) What are the properties for a set of components to be expressed compositionally. (2) What are the properties for mappings between compositional and entangled representations. We provide examples to better understand the conditions and how to apply them. E.g., we use the theory to give a new explanation of why attention mechanism helps compositionality. We hope this work will help to advance understanding of compositionality and improvement of artificial intelligence towards human level.\n", + "title": "Necessary and Sufficient Conditions for Compositional Representations", + "authors": [ + "Yuanpeng Li" + ], + "emails": [ + "~Yuanpeng_Li2" + ], + "rank": 2925 + }, + { + "url": "https://openreview.net/forum?id=pbUcKxmiM54", + "decision": "Reject", + "ratings": [ + 3, + 4, + 3, + 3 + ], + "rating": "3.19", + "confidences": [ + 3, + 3, + 5, + 5 + ], + "abstract": "Learning for Deductive Reasoning is an open problem in the machine learning world today. \nDeductive reasoning involves storing facts in memory and generation of newer facts over time. \nThe concept of memory, processor and code in deduction systems is fundamentally different from the purpose and formulation of weights in a deep neural network. 
\nA majority of the machine learning models are inductive reasoning models including state of the art deep neural networks which are effectively tensor interpolation based models.\nA step towards realization of memory is through recurrent neural networks and its variants, however the formal representation is not sufficient enough to capture a complex mapping function between input and output patterns.\nDeep neural networks are positioned to do away with feature engineering which is essentially deductive reasoning methodology.\nThere are existing works in deductive reasoning in neural networks that require learning of syntax, unification and deduction and operate on text data as sequence of tokens.\nHowever the performance of deductive reasoning networks is far from perfection which may be either due to syntax or deduction aspects.\nIn this context, we have proposed a suite of completely numeric data sets which do not require parsing as with text data.\nThe 10 data sets are for - (a) selection (3 data sets) - minimum, maximum and top 2nd element in an array of numbers; (b) matching (3 data sets) - duplicate detection, counting and histogram learning; (c) divisibility tests (2 data sets) - divisibility of two numbers and divisibility by 3; (d) representation (2 data sets) - binary representation and parity. \nThough extremely simple in terms of feature engineering, in all of these tests, simple deep neural networks, random forest and recurrent neural networks have failed with very low accuracies. 
\nWe propose these as numerical test-bed for testing learning models for deductive reasoning.", + "title": "Simple deductive reasoning tests and numerical data sets for exposing limitation of today's deep neural networks", + "authors": [ + "Kalidas Yeturu", + "Manish Kumar Srivastava" + ], + "emails": [ + "~Kalidas_Yeturu1", + "~Manish_Kumar_Srivastava1" + ], + "rank": 2926 + }, + { + "url": "https://openreview.net/forum?id=S7KQa2XYpiQ", + "ratings": [ + 4, + 3, + 2, + 4 + ], + "rating": "3.18", + "confidences": [ + 4, + 4, + 5, + 4 + ], + "abstract": "In this work, we focus on improving the novel class generalization of few-shot learning. By addressing the difference between feature distributions of base and novel classes, we propose the adaptive feature distribution method which is to finetune one scale vector using the support set of novel classes. The scale vector is applied on the normalized feature distribution and by using one scale vector to reshape the feature space manifold, we obtain consistent performance improvement for both in-domain and cross-domain evaluations. By simply finetuning one scale vector using 5 images, we observe a 2.23% performance boost on 5-way 1-shot cross-domain evaluation with CUB over statistics results of 2000 episodes. This approach is simple yet effective. By just finetuning a single scale vector we provide a solution of reducing number of parameters while still obtain generalization ability for few-shot learning. 
We achieve the state-of-the-art performance on mini-Imagenet, tiered-Imagenet as well as cross-domain evaluation on CUB.", + "title": "Improve Novel Class Generalization By Adaptive Feature Distribution for Few-Shot Learning", + "authors": [ + "Ran Tao", + "Marios Savvides" + ], + "emails": [ + "~Marios_Savvides1", + "~Ran_Tao2" + ], + "rank": 2927 + }, + { + "url": "https://openreview.net/forum?id=rQYyXqHPgZR", + "decision": "Reject", + "ratings": [ + 4, + 4, + 3, + 2 + ], + "rating": "3.18", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "Current reinforcement learning generally uses discounted return as its learning objective. However, real-world tasks may often demand a high success rate, which can be quite different from optimizing rewards. In this paper, we explicitly formulate the success rate as an undiscounted form of return with {0, 1}-binary reward function. Unfortunately, applying traditional Bellman updates to value function learning can be problematic for learning undiscounted return, and thus not suitable for optimizing success rate. From our theoretical analysis, we discover that values across different states tend to converge to the same value, resulting in the agent wandering around those states without making any actual progress. This further leads to reduced learning efficiency and inability to complete a task in time. To combat the aforementioned issue, we propose a new method, which introduces Loop Penalty (LP) into value function learning, to penalize disoriented cycling behaviors in the agent's decision-making. We demonstrate the effectiveness of our proposed LP on three environments, including grid-world cliff-walking, Doom first-person navigation and robot arm control, and compare our method with Q-learning, Monte-Carlo and Proximal Policy Optimization (PPO). 
Empirically, LP improves the convergence of training and achieves a higher success rate.", + "title": "Success-Rate Targeted Reinforcement Learning by Disorientation Penalty", + "authors": [ + "Haichuan Gao", + "Zhile Yang", + "Tian Tan", + "Feng Chen" + ], + "emails": [ + "~Tian_Tan1", + "~Feng_Chen1", + "~Zhile_Yang1", + "~Haichuan_Gao1" + ], + "rank": 2928 + }, + { + "url": "https://openreview.net/forum?id=Rq31tXaqXq", + "decision": "Reject", + "ratings": [ + 3, + 3, + 4, + 3, + 3 + ], + "rating": "3.16", + "confidences": [ + 4, + 4, + 3, + 4, + 4 + ], + "abstract": "The past years have witnessed an explosion of deep learning frameworks like PyTorch and TensorFlow since the success of deep neural networks. These frameworks have significantly facilitated algorithm development in multimedia research and production. However, how to easily and efficiently build an end-to-end visual analysis pipeline with these algorithms is still an open issue. In most cases, developers have to spend a huge amount of time tackling data input and output, optimizing computation efficiency, or even debugging exhausting memory leaks together with algorithm development. VideoFlow aims to overcome these challenges by providing a flexible, efficient, extensible, and secure visual analysis framework for both the academia and industry. With VideoFlow, developers can focus on the improvement of algorithms themselves, as well as the construction of a complete visual analysis workflow. VideoFlow has been incubated in the practices of smart city innovation for more than three years. It has been widely used in tens of intelligent visual analysis systems. 
VideoFlow will be open-sourced at \\url{https://github.com/xxx/videoflow}.", + "title": "VideoFlow: A Framework for Building Visual Analysis Pipelines", + "authors": [ + "Yue Wu", + "Jianqiang Huang", + "Jiangjie Zhen", + "Guokun Wang", + "Chen Shen", + "Chang Zhou", + "Xian-Sheng Hua" + ], + "emails": [ + "alibaba-inc.com", + "~Yue_Wu18" + ], + "rank": 2929 + }, + { + "url": "https://openreview.net/forum?id=S7Aeama_0s", + "decision": "Reject", + "ratings": [ + 2, + 3, + 5, + 4, + 2 + ], + "rating": "3.14", + "confidences": [ + 4, + 4, + 4, + 4, + 5 + ], + "abstract": "Learning high-dimensional probability distributions by competitively training generative and discriminative neural networks is a prominent approach of Generative Adversarial Networks (GANs) among generative models to model complex real-world data. Nevertheless, training GANs likely suffer from non-convergence problem, mode collapse and gradient explosion or vanishing. Least Squares GAN (LSGANs) and Wasserstein GANs (WGAN) are of representative variants of GANs in literature that diminish the inherent problems of GANs by proposing the modification methodology of loss functions. However, LSGANs often fall into local minima and cause mode collapse. While WGANs unexpectedly encounter with inefficient computation and slow training due to its constraints in Wasserstein distance approximation. In this paper, we propose Quantile Regression GAN (QRGAN) in which quantile regression is adopted to minimize 1-Wasserstein distance between real and generated data distribution as a novel approach in modification of loss functions for improvement of GANs. To study the culprits of mode collapse problem, the output space of discriminator and gradients of fake samples are analyzed to see if the discriminator guides the generator well. And we found that the discriminator should not be bounded to specific numbers. Our proposed QRGAN exposes high robustness against mode collapse problem. 
Furthermore, QRGAN obtains an apparent improvement in the evaluation and comparison of Frechet Inception Distance (FID) for generation performance assessment compared to existing variants of GANs.", + "title": "QRGAN: Quantile Regression Generative Adversarial Networks", + "authors": [ + "Sunyeop Lee", + "Tuan Anh Nguyen", + "Dugki Min" + ], + "emails": [ + "~Tuan_Anh_Nguyen3", + "gmail.com", + "konkuk.ac.kr" + ], + "rank": 2930 + }, + { + "url": "https://openreview.net/forum?id=IazZhsJK7wJ", + "ratings": [ + 4, + 3, + 4, + 2 + ], + "rating": "3.13", + "confidences": [ + 3, + 3, + 4, + 5 + ], + "abstract": "This paper aims to solve a series of referential problems in sequence decoding caused by data sparsity and corpus scarce in low-resource Neural Machine Translation (NMT), including pronoun missing, reference error, bias and so on. It is difficult to find the essential reason of these problems because they are only shown in the prediction results and involve all aspects of the model. Different from the usual solutions based on complex mathematical rule setting and adding artificial features, we expect to turn the problems in the predictions into noise as much as possible, and use adversarial training to make the model find the balance between the noise and the golden samples, instead of exploring the reason of the problem during the complex training. In this paper, only a simple noise-based preprocessing operation and a slight modification of the adversarial training can make the model generalize to a series of referential problems in low-resource NMT task. 
On Korean-Chinese, Mongolian-Chinese and Arabic-Chinese tasks, the evaluation of BLEU score and the accuracy of pronouns in sequence have been significantly improved.", + "title": "A Simple and General Strategy for Referential Problem in Low-Resource Neural Machine Translation", + "authors": [ + "Yatu Ji", + "Nier Wu", + "Hongxu Hou" + ], + "emails": [ + "~Nier_Wu1", + "~Hongxu_Hou1", + "~Yatu_Ji1" + ], + "rank": 2931 + }, + { + "url": "https://openreview.net/forum?id=jnRqf0CzBK", + "decision": "Reject", + "ratings": [ + 4, + 4, + 2, + 3 + ], + "rating": "3.12", + "confidences": [ + 4, + 3, + 5, + 4 + ], + "abstract": "We present a novel blind source separation (BSS) method, called information geometric blind source separation (IGBSS). Our formulation is based on the log-linear model equipped with a hierarchically structured sample space, which has theoretical guarantees to uniquely recover a set of source signals by minimizing the KL divergence from a set of mixed signals. Source signals, received signals, and mixing matrices are realized as different layers in our hierarchical sample space. Our empirical results have demonstrated on images and time series data that our approach is superior to well established techniques and is able to separate signals with complex interactions.", + "title": "Hierarchical Probabilistic Model for Blind Source Separation via Legendre Transformation", + "authors": [ + "Simon Luo", + "Lamiae Azizi", + "Mahito Sugiyama" + ], + "emails": [ + "~Simon_Luo1", + "~Mahito_Sugiyama1", + "sydney.edu.au" + ], + "rank": 2932 + }, + { + "url": "https://openreview.net/forum?id=-RQVWPX73VP", + "decision": "Reject", + "ratings": [ + 3, + 2, + 4, + 3, + 4 + ], + "rating": "3.11", + "confidences": [ + 3, + 5, + 4, + 4, + 3 + ], + "abstract": "Meta-reinforcement learning (meta-RL) algorithms have successfully trained agent systems to perform well on different tasks within only few updates. 
However, in gradient-based meta-RL algorithms, the Q-function at adaptation step is mainly estimated by the return of few trajectories, which can lead to high variance in Q-value and biased meta-gradient estimation, and the adaptation uses a large number of batched trajectories. To address these challenges, we propose a new meta-RL algorithm that can reduce the variance and bias of the meta-gradient estimation and perform few-shot task data sampling, which makes the meta-policy more interpretable. We reformulate the meta-RL objective, and introduce contextual Q-function as a meta-policy critic during task adaptation step and learn the Q-function under a soft actor-critic (SAC) framework. The experimental results on 2D navigation task and meta-RL benchmarks show that our approach can learn an more interpretable meta-policy to explore unknown environment and the performance are comparable to previous gradient-based algorithms.", + "title": "Interpretable Meta-Reinforcement Learning with Actor-Critic Method", + "authors": [ + "Xingyuan Liang", + "Xu-Ying Liu" + ], + "emails": [ + "~Xingyuan_Liang1", + "seu.edu.cn" + ], + "rank": 2933 + }, + { + "url": "https://openreview.net/forum?id=VMAesov3dfU", + "decision": "Reject", + "ratings": [ + 5, + 1, + 4, + 3 + ], + "rating": "3.08", + "confidences": [ + 3, + 4, + 3, + 3 + ], + "abstract": "In this paper, we argue that gradient descent is one of the reasons that make compositionality learning hard during neural network optimization. We find that the optimization process imposes a bias toward non-compositional solutions. This is caused by gradient descent, trying to use all available and redundant information from input, violating the conditional independence property of compositionality. Based on this finding, we suggest that compositionality learning approaches considering only model architecture design are unlikely to achieve complete compositionality. 
This is the first work to investigate the relation between compositional learning and gradient descent. We hope this study provides novel insights into compositional generalization, and forms a basis for new research directions to equip machine learning models with such skills for human-level intelligence.", + "title": "Gradient Descent Resists Compositionality", + "authors": [ + "Yuanpeng Li", + "Liang Zhao", + "Joel Hestness", + "Kenneth Church", + "Mohamed Elhoseiny" + ], + "emails": [ + "~Joel_Hestness2", + "~Yuanpeng_Li2", + "~Kenneth_Church1", + "~Liang_Zhao2", + "~Mohamed_Elhoseiny1" + ], + "rank": 2934 + }, + { + "url": "https://openreview.net/forum?id=6YuRviF_FC-", + "decision": "Reject", + "ratings": [ + 3, + 2, + 4 + ], + "rating": "3.00", + "confidences": [ + 4, + 5, + 5 + ], + "abstract": "Calibration is the most critical data processing step needed for generating images of high dynamic range \\citep{editioncasa}. With ever-increasing data volumes produced by modern radio telescopes \\cite{aniyan2017classifying}, astronomers are overwhelmed by the amount of data that needs to be manually processed and analyzed using limited computational resources \\citep{yatawatta2020stochastic}. Therefore, intelligent and automated systems are required to overcome these challenges. Traditionally, astronomers use a package such as Common Astronomy Software Applications (CASA) to compute the gain solutions based on regular observations of a known calibrator source \\citep{thompson2017interferometry} \\citep{abebe2015study} \\citep{grobler2016calibration} \\citep{editioncasa}. The traditional approach to calibration is iterative and time-consuming \\citep{jajarmizadeh2017optimal}, thus, the proposal of machine learning techniques. The applications of machine learning have created an opportunity to deal with complex problems currently encountered in radio astronomy data processing \\citep{aniyan2017classifying}. 
In this work, we propose the use of supervised machine learning models to first generation calibration (1GC), using the KAT-7 telescope environmental and pointing sensor data recorded during observations. Applying machine learning to 1GC, as opposed to calculating the gain solutions in CASA, has shown evidence of reducing computation, as well as accurately predicting the 1GC gain solutions and antenna behaviour. These methods are computationally less expensive, however they have not fully learned to generalise in predicting accurate 1GC solutions by looking at environmental and pointing sensors. We use an ensemble multi-output regression models based on random forest, decision trees, extremely randomized trees and K-nearest neighbor algorithms. The average prediction error obtained during the testing of our models on testing data is \u22480.01<rmse<0.09 for gain amplitude per antenna, and 0.2rad<rmse<0.5rad for gain phase. This shows that the instrumental parameters used to train our model strongly correlate with gain amplitude effects than a phase.\n", + "title": "ZCal: Machine learning methods for calibrating radio interferometric data ", + "authors": [ + "Simphiwe Zitha", + "Arun aniyan", + "Oleg Smirnov", + "Risuna Nkolele" + ], + "emails": [ + "gmail.com", + "~Simphiwe_Zitha1" + ], + "rank": 2935 + }, + { + "url": "https://openreview.net/forum?id=9UFIOHeVEh", + "decision": "Reject", + "ratings": [ + 3, + 3, + 3 + ], + "rating": "3.00", + "confidences": [ + 3, + 5, + 4 + ], + "abstract": "In image-based object classification, the visual appearance of objects determines which class they are assigned to. External variables that are independent of the object, such as the perspective or the lighting conditions, can modify the object's appearance resulting in ambiguous images that lead to misclassifications. Previous work has proposed methods for estimating the uncertainty of predictions and measure their confidence. 
However, such methods do not indicate which variables are the potential sources that cause uncertainty. In this paper, we propose a method for image-based object classification that uses disentangled representations to indicate which are the external variables that contribute the most to the uncertainty of the predictions. This information can be used to identify the external variables that should be modified to decrease the uncertainty and improve the classification.", + "title": "Identifying the Sources of Uncertainty in Object Classification", + "authors": [ + "Luis Armando P\u00e9rez Rey", + "Berk \u0130\u015fler", + "Mike Holenderski", + "Dmitri Jarnikov" + ], + "emails": [ + "~Mike_Holenderski1", + "gmail.com", + "tue.nl", + "~Luis_Armando_P\u00e9rez_Rey1" + ], + "rank": 2936 + }, + { + "url": "https://openreview.net/forum?id=tmvULc0laNv", + "ratings": [ + 4, + 3, + 2, + 3 + ], + "rating": "3.00", + "confidences": [ + 4, + 5, + 4, + 3 + ], + "abstract": "The autoencoder model uses an encoder to map data samples to a lower dimensional latent space and then a decoder to map the latent space representations back to the data space. Implicitly, it relies on the encoder to approximate the inverse of the decoder network, so that samples can be mapped to and back from the latent space faithfully. This approximation may lead to sub-optimal latent space representations. In this work, we investigate a decoder-only method that uses gradient flow to encode data samples in the latent space. The gradient flow is defined based on a given decoder and aims to find the optimal latent space representation for any given sample through optimisation, eliminating the need of an approximate inversion through an encoder. Implementing gradient flow through ordinary differential equations (ODE), we leverage the adjoint method to train a given decoder. We further show empirically that the costly integrals in the adjoint method may not be entirely necessary. 
Additionally, we propose a 2nd order ODE variant to the method, which approximates Nesterov's accelerated gradient descent, with faster convergence per iteration. Commonly used ODE solvers can be quite sensitive to the integration step-size depending on the stiffness of the ODE. To overcome the sensitivity for gradient flow encoding, we use an adaptive solver that prioritises minimising loss at each integration step. We assess the proposed method in comparison to the autoencoding model. In our experiments, GFE showed a much higher data-efficiency than the autoencoding model, which can be crucial for data scarce applications.", + "title": "Gradient flow encoding with distance optimization adaptive step size", + "authors": [ + "Kyriakos Flouris", + "Anna Volokitin", + "Gustav Bredell", + "Ender Konukoglu" + ], + "emails": [ + "~Kyriakos_Flouris1", + "~Anna_Volokitin1", + "vision.ee.ethz.ch", + "~Ender_Konukoglu1" + ], + "rank": 2937 + }, + { + "url": "https://openreview.net/forum?id=t4hNn7IvNZX", + "decision": "Reject", + "ratings": [ + 6, + 3, + 2, + 2 + ], + "rating": "3.00", + "confidences": [ + 3, + 4, + 5, + 4 + ], + "abstract": "The robustness of deep neural networks against adversarial example attacks has received much attention recently. We focus on certified robustness of smoothed classifiers in this work, and propose to use the worst-case population loss over noisy inputs as a robustness metric. Under this metric, we provide a tractable upper bound serving as a robustness certificate by exploiting the duality. To improve the robustness, we further propose a noisy adversarial learning procedure to minimize the upper bound following the robust optimization framework. The smoothness of the loss function ensures the problem easy to optimize even for non-smooth neural networks. We show how our robustness certificate compares with others and the improvement over previous works. 
Experiments on a variety of datasets and models verify that in terms of empirical accuracies, our approach exceeds the state-of-the-art certified/heuristic methods in defending adversarial examples.", + "title": "Certified Distributional Robustness via Smoothed Classifiers ", + "authors": [ + "Jungang Yang", + "Liyao Xiang", + "Ruidong Chen", + "Yukun Wang", + "Wei Wang", + "Xinbing Wang" + ], + "emails": [ + "~Xinbing_Wang1", + "sjtu.edu.cn", + "~Jungang_Yang2", + "~Liyao_Xiang1", + "~Wei_Wang50" + ], + "rank": 2938 + }, + { + "url": "https://openreview.net/forum?id=-aThAo4b1zn", + "decision": "Reject", + "ratings": [ + 3, + 4, + 2, + 2, + 4 + ], + "rating": "3.00", + "confidences": [ + 3, + 4, + 4, + 3, + 3 + ], + "abstract": "Recently, self-supervised learning (SSL) algorithms have been applied to Few-shot learning(FSL). FSL aims at distilling transferable knowledge on existing classes with large-scale labeled data to cope with novel classes for which only a few labeled data are available. Due to the limited number of novel classes, the initial embedding network becomes an essential component and can largely affect the performance in practice. But almost no one analyzes why a pre-trained embedding network with self-supervised training can provide representation for downstream FSL tasks in theory. In this paper, we first summarized the supervised FSL methods and explained why SSL is suitable for FSL. Then we further analyzed the main difference between supervised training and self-supervised training on FSL and obtained the bound for the gap between self-supervised loss and supervised loss. Finally, we proposed potential ways to improve the test accuracy under the setting of self-supervised FSL. 
", + "title": "A Theory of Self-Supervised Framework for Few-Shot Learning", + "authors": [ + "Zhong Cao", + "Jiang Lu", + "Jian Liang", + "Changshui Zhang" + ], + "emails": [ + "~Changshui_Zhang2", + "~Zhong_Cao1", + "~Jian_Liang3", + "~Jiang_Lu1" + ], + "rank": 2939 + }, + { + "url": "https://openreview.net/forum?id=86PW5gch8VZ", + "decision": "Reject", + "ratings": [ + 2, + 4, + 4, + 2 + ], + "rating": "3.00", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "Gradient quantization is widely adopted to mitigate communication costs in distributed learning systems. Existing gradient quantization algorithms often rely on design heuristics and/or empirical evidence to tune the quantization strategy for different learning problems. To the best of our knowledge, there is no theoretical framework characterizing the trade-off between communication cost and model accuracy under dynamic gradient quantization strategies. This paper addresses this issue by proposing a novel dynamic quantized SGD (DQSGD) framework, which enables us to optimize the quantization strategy for each gradient descent step by exploring the trade-off between communication cost and modeling error. In particular, we derive an upper bound, tight in some cases, of the modeling error for arbitrary dynamic quantization strategy. By minimizing this upper bound, we obtain an enhanced quantization algorithm with significantly improved modeling error under given communication overhead constraints. Besides, we show that our quantization scheme achieves a strengthened communication cost and model accuracy trade-off in a wide range of optimization models. Finally, through extensive experiments on large-scale computer vision and natural language processing tasks on CIFAR-10, CIFAR-100, and AG-News datasets, respectively. 
we demonstrate that our quantization scheme significantly outperforms the state-of-the-art gradient quantization methods in terms of communication costs.", + "title": "DQSGD: DYNAMIC QUANTIZED STOCHASTIC GRADIENT DESCENT FOR COMMUNICATION-EFFICIENT DISTRIBUTED LEARNING", + "authors": [ + "Guangfeng Yan", + "Shao-Lun Huang", + "Tian Lan", + "Linqi Song" + ], + "emails": [ + "~Guangfeng_Yan1", + "~Linqi_Song1", + "~Shao-Lun_Huang1", + "~Tian_Lan4" + ], + "rank": 2940 + }, + { + "url": "https://openreview.net/forum?id=e6hMkY6MFcU", + "decision": "Reject", + "ratings": [ + 2, + 3, + 4 + ], + "rating": "3.00", + "confidences": [ + 4, + 3, + 4 + ], + "abstract": "Black box attacks on traditional deep learning models trained for text classifica- tion target important words in a piece of text, in order to change model prediction. Current approaches towards highlighting important features are time consuming and require large number of model queries. We present a simple yet novel method to calculate word importance scores, based on model predictions on single words. These scores, which we call WordsWorth scores, need to be calculated only once for the training vocabulary. They can be used to speed up any attack method that requires word importance, with negligible loss of attack performance. We run ex- periments on a number of datasets trained on word-level CNNs and LSTMs, for sentiment analysis and topic classification and compare to state-of-the-art base- lines. Our results show the effectiveness of our method in attacking these models with success rates that are close to the original baselines. We argue that global importance scores act as a very good proxy for word importance in a local context because words are a highly informative form of data. This aligns with the manner in which humans interpret language, with individual words having well- defined meaning and powerful connotations. 
We further show that these scores can be used as a debugging tool to interpret a trained model by highlighting rele- vant words for each class. Additionally, we demonstrate the effect of overtraining on word importance, compare the robustness of CNNs and LSTMs, and explain the transferability of adversarial examples across a CNN and an LSTM using these scores. We highlight the fact that neural networks make highly informative pre- dictions on single words.", + "title": "WordsWorth Scores for Attacking CNNs and LSTMs for Text Classification", + "authors": [ + "Nimrah Shakeel" + ], + "emails": [ + "~Nimrah_Shakeel1" + ], + "rank": 2941 + }, + { + "url": "https://openreview.net/forum?id=VwU1lyi5nzb", + "decision": "Reject", + "ratings": [ + 3, + 1, + 4, + 5 + ], + "rating": "3.00", + "confidences": [ + 4, + 5, + 4, + 3 + ], + "abstract": "Question-answering (QA) models aim to find an answer given a question and con- text. Language models like BERT are used to associate question and context to find an answer span. Prior art on QA focuses on finding the best answer. There is a need for multi-span QA models to output the top-K likely answers to questions such as \"Which companies Elon Musk started?\" or \"What factors cause global warming?\" In this work, we introduce Span-Image architecture that can learn to identify multiple answers in a context for a given question. This architecture can incorporate prior information about the span length distribution or valid span patterns (e.g., end index has to be larger than start index), thus eliminating the need for post-processing. 
Span-Image architecture outperforms the state-of-the-art in top-K answer accuracy on SQuAD dataset and in multi-span answer accuracy on an Amazon internal dataset.\n", + "title": "MULTI-SPAN QUESTION ANSWERING USING SPAN-IMAGE NETWORK", + "authors": [ + "Tarik Arici", + "Hayreddin Ceker", + "Ismail Baha Tutar" + ], + "emails": [ + "amazon.com", + "~Hayreddin_Ceker1" + ], + "rank": 2942 + }, + { + "url": "https://openreview.net/forum?id=UEtNMTl6yN", + "decision": "Reject", + "ratings": [ + 3, + 4, + 2, + 3 + ], + "rating": "3.00", + "confidences": [ + 5, + 5, + 5, + 4 + ], + "abstract": "Tasks such as graph classification, require graph pooling to learn graph-level representations from constituent node representations. In this work, we propose two novel methods using fully connected neural network layers for graph pooling, namely Neural Pooling Method 1 and 2. Our proposed methods have the ability to handle variable number of nodes in different graphs, and are also invariant to the isomorphic structures of graphs. In addition, compared to existing graph pooling methods, our proposed methods are able to capture information from all nodes, collect second-order statistics, and leverage the ability of neural networks to learn relationships among node representations, making them more powerful. We perform experiments on graph classification tasks in the bio-informatics and social network domains to determine the effectiveness of our proposed methods. Experimental results show that our methods lead to an absolute increase of upto 1.2% in classification accuracy over previous works and a general decrease in standard deviation across multiple runs indicating greater reliability. 
Experimental results also indicate that this improvement in performance is consistent across several datasets.\n", + "title": "Neural Pooling for Graph Neural Networks", + "authors": [ + "Sai Sree Harsha", + "Deepak Mishra" + ], + "emails": [ + "iist.ac.in", + "~Sai_Sree_Harsha1" + ], + "rank": 2943 + }, + { + "url": "https://openreview.net/forum?id=nEMiSX_ipXr", + "decision": "Reject", + "ratings": [ + 3, + 3, + 3, + 3 + ], + "rating": "3.00", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "This paper analyzes the problems of adversarial accuracy and adversarial training. We argue that standard adversarial accuracy fails to properly measure the robustness of classifiers. Its definition has a tradeoff with standard accuracy even when we neglect generalization. In order to handle the problems of the standard adversarial accuracy, we introduce a new measure for the robustness of classifiers called genuine adversarial accuracy. It can measure the adversarial robustness of classifiers without trading off accuracy on clean data and accuracy on the adversarially perturbed samples. In addition, it does not favor a model with invariance-based adversarial examples, samples whose predicted classes are unchanged even if the perceptual classes are changed. We prove that a single nearest neighbor (1-NN) classifier is the most robust classifier according to genuine adversarial accuracy for given data and a norm-based distance metric when the class for each data point is unique. 
Based on this result, we suggest that using poor distance metrics might be one factor for the tradeoff between test accuracy and lp norm-based test adversarial robustness.", + "title": "Proper Measure for Adversarial Robustness", + "authors": [ + "Hyeongji Kim", + "Ketil Malde" + ], + "emails": [ + "malde.org", + "~Hyeongji_Kim1" + ], + "rank": 2944 + }, + { + "url": "https://openreview.net/forum?id=5PiSFHhRe2C", + "decision": "Reject", + "ratings": [ + 2, + 3, + 4 + ], + "rating": "3.00", + "confidences": [ + 4, + 5, + 4 + ], + "abstract": "Aspect based sentiment analysis (ABSA) is a challenging natural language processing task that could benefit from syntactic information. Previous work exploit dependency parses to improve performance on the task, but this requires the existence of good dependency parsers. In this paper, we build a constituent-based transformer for ABSA that can induce constituents without constituent parsers. We also apply meta auxiliary learning to generate labels on edges between tokens, supervised by the objective of the ABSA task. Without input from dependency parsers, our models outperform previous work on three Twitter data sets and match previous work closely on two review data sets.", + "title": "Meta Auxiliary Labels with Constituent-based Transformer for Aspect-based Sentiment Analysis", + "authors": [ + "Ling Min Serena Khoo", + "Hai Leong Chieu" + ], + "emails": [ + "~Hai_Leong_Chieu1", + "~Ling_Min_Serena_Khoo1" + ], + "rank": 2945 + }, + { + "url": "https://openreview.net/forum?id=xiwHM0l55c3", + "decision": "Reject", + "ratings": [ + 4, + 3, + 2, + 3 + ], + "rating": "3.00", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "In this paper, we are interested in building a domain knowledge based deep learning framework to solve the chiller plants energy optimization problems. Compared to the hotspot applications of deep learning (e.g. 
image classification and NLP), it is difficult to collect enormous data for deep network training in real-world physical systems. Most existing methods reduce the complex systems into linear model to facilitate the training on small samples. To tackle the small sample size problem, this paper considers domain knowledge in the structure and loss design of deep network to build a nonlinear model with lower redundancy function space. Specifically, the energy consumption estimation of most chillers can be physically viewed as an input-output monotonic problem. Thus, we can design a Neural Network with monotonic constraints to mimic the physical behavior of the system. We verify the proposed method in a cooling system of a data center, experimental results show the superiority of our framework in energy optimization compared to the existing ones.", + "title": "Monotonic neural network: combining deep learning with domain knowledge for chiller plants energy optimization", + "authors": [ + "Fanhe Ma", + "Faen Zhang", + "Shenglan Ben", + "Shuxin Qin", + "pengcheng Zhou", + "Changsheng Zhou", + "Fengyi Xu" + ], + "emails": [ + "ainnovation.com", + "~Fanhe_Ma1", + "~Shenglan_Ben2", + "~Shuxin_Qin1" + ], + "rank": 2946 + }, + { + "url": "https://openreview.net/forum?id=mT1d_6PewBl", + "ratings": [ + 5, + 3, + 2 + ], + "rating": "3.00", + "confidences": [ + 2, + 2, + 4 + ], + "abstract": "The paper studies the impacts of an aggregative Convolutional Neural Network (CNN) for the derivation of generalized artificial intelligence when starting from a specialized set of CNNs. The aggregation proposed relies on machine learning from specific examples being the categorical probabilities fed by the downstream specialized CNNs under consideration. Aggregation proof of concepts are provided in terms of multimodel, multimodal and distributed schemes. The multimodel framework is such that different CNN models operating on the same modality cooperate for decision purpose. 
The multimodal framework implies specializations of CNNs with respect to different input modalities. The distributed framework proposed is associated with assessment exchanges: it such that the aggregation aims at determining relevant joint assessments for mapping a given input to a single or a multiple output category. Performance of these aggregation frameworks are shown to be outstanding for both standard and extreme classification issues.", + "title": "Towards Generalized Artificial Intelligence by Assessment Aggregation with Applications to Standard and Extreme Classifications", + "authors": [ + "Abdourrahmane M ATTO" + ], + "emails": [ + "~Abdourrahmane_M_ATTO1" + ], + "rank": 2947 + }, + { + "url": "https://openreview.net/forum?id=okT7QRhSYBw", + "decision": "Reject", + "ratings": [ + 3, + 3, + 3, + 3 + ], + "rating": "3.00", + "confidences": [ + 4, + 4, + 3, + 3 + ], + "abstract": "Deep networks have been revolutionary in improving performance of machine learning and artificial intelligence systems.\nTheir high prediction accuracy, however, comes at a price of model irreproducibility in very high levels that do not occur with classical linear models. Two models, even if they are supposedly identical, with identical architecture and identical trained parameter sets, and that are trained on the same set of training examples, while possibly providing identical average prediction accuracies, may predict very differently on individual, previously unseen, examples. Prediction differences may be as large as the order of magnitude of the predictions themselves. Ensembles have been shown to somewhat mitigate this behavior, but without an extra push, may not be utilizing their full potential. In this work, a novel approach, Anti-Distillation, is proposed to address irreproducibility in deep networks, where ensemble models are used to generate predictions. 
Anti-Distillation forces ensemble components away from one another by techniques like de-correlating their outputs over mini-batches of examples, forcing them to become even more different and more diverse. Doing so enhances the benefit of ensembles, making the final predictions more reproducible. Empirical results demonstrate substantial prediction difference reductions achieved by Anti-Distillation on benchmark and real datasets.", + "title": "Anti-Distillation: Improving Reproducibility of Deep Networks", + "authors": [ + "Gil I Shamir", + "Lorenzo Coviello" + ], + "emails": [ + "google.com", + "~Lorenzo_Coviello1" + ], + "rank": 2948 + }, + { + "url": "https://openreview.net/forum?id=ERAQ5ZCP9t", + "ratings": [ + 3, + 3, + 3, + 3 + ], + "rating": "3.00", + "confidences": [ + 5, + 4, + 5, + 5 + ], + "abstract": "Multi-view data has become ubiquitous, especially with multi-sensor systems like self-driving cars or medical patient-side monitors.\n\nWe look at modeling multi-view data through robust representation learning, with the goal of leveraging relationships between views and building resilience to missing information.\nWe propose a new flavor of multi-view AutoEncoders, the Robust Multi-view AutoEncoder, which explicitly encourages robustness to missing views.\nThe principle we use is straightforward: we apply the idea of drop-out to the level of views.\nDuring training, we leave out views as input to our model while forcing it to reconstruct all of them.\nWe also consider a flow-based generative modeling extension of our approach in the case where all the views are available.\n\nWe conduct experiments for different scenarios: directly using the learned representations for reconstruction, as well as a two-step process where the learned representation is subsequently used as features for the data for a down-stream application.\nOur synthetic and real-world experiments show promising results for the application of these models to robust representation 
learning.", + "title": "Robust Multi-view Representation Learning", + "authors": [ + "Sibi Venkatesan", + "Kyle Miller", + "Artur Dubrawski" + ], + "emails": [ + "~Sibi_Venkatesan1", + "~Kyle_Miller1", + "~Artur_Dubrawski2" + ], + "rank": 2949 + }, + { + "url": "https://openreview.net/forum?id=hga6dk7nxFB", + "ratings": [ + 3, + 3, + 3, + 3 + ], + "rating": "3.00", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Modern sequencing technology has produced a vast quantity of proteomic data, which has been key to the development of various deep learning models within the field. However, there are still challenges to overcome with regards to modelling the properties of a protein, especially when labelled resources are scarce. Developing interpretable deep learning models is an essential criterion, as proteomics research requires methods to understand the functional properties of proteins. The ability to derive quality information from both the model and the data will play a vital role in the advancement of proteomics research. In this paper, we seek to leverage a BERT model that has been pre-trained on a vast quantity of proteomic data, to model a collection of regression tasks using only a minimal amount of data. We adopt a triplet network structure to fine-tune the BERT model for each dataset and evaluate its performance on a set of downstream task predictions: plasma membrane localisation, thermostability, peak absorption wavelength, and enantioselectivity. Our results significantly improve upon the original BERT baseline as well as the previous state-of-the-art models for each task, demonstrating the benefits of using a triplet network for refining such a large pre-trained model on a limited dataset. As a form of white-box deep learning, we also visualise how the model attends to specific parts of the protein and how the model detects critical modifications that change its overall function. 
", + "title": "Deep Learning Proteins using a Triplet-BERT network", + "authors": [ + "Mark Lennox", + "Neil M. Robertson", + "Barry Devereux" + ], + "emails": [ + "~Neil_M._Robertson1", + "~Barry_Devereux1", + "~Mark_Lennox1" + ], + "rank": 2950 + }, + { + "url": "https://openreview.net/forum?id=RB0iNPXIj60", + "decision": "Reject", + "ratings": [ + 4, + 2, + 4, + 2 + ], + "rating": "2.95", + "confidences": [ + 4, + 5, + 5, + 5 + ], + "abstract": "We present a conceptually simple yet powerful and flexible scheme for refining predictions of bounding boxes. Our approach is trained standalone on GT boxes and can then be combined with an object detector to improve its predictions. The method, called BBRefinement, uses mixture data of image information and the object's class and center. Due to the transformation of the problem into a domain where BBRefinement does not care about multiscale detection, recognition of the object's class, computing confidence, or multiple detections, the training is much more effective. It results in the ability to refine even COCO's ground truth labels into a more precise form. BBRefinement improves the performance of SOTA architectures up to 2mAP points on the COCO dataset in the benchmark. The refinement process is fast; it adds 50-80ms overhead to a standard detector using RTX2080, so it can run in real-time on standard hardware. The code is available at https://gitlab.com/irafm-ai/bb-refinement.", + "title": "BBRefinement: an universal scheme to improve precision of box object detectors", + "authors": [ + "Petr Hurtik", + "Marek Vajgl" + ], + "emails": [ + "~Marek_Vajgl1", + "~Petr_Hurtik1" + ], + "rank": 2951 + }, + { + "url": "https://openreview.net/forum?id=g4szfsQUdy3", + "decision": "Reject", + "ratings": [ + 2, + 4, + 3, + 3 + ], + "rating": "2.94", + "confidences": [ + 5, + 4, + 3, + 4 + ], + "abstract": "Random label noises (or observational noises) widely exist in practical machinelearning settings. 
we analyze the learning dynamics of stochastic gradient descent(SGD) over the quadratic loss with unbiased label noises, and investigate a newnoise term of dynamics, which is dynamized and influenced by mini-batch sam-pling and random label noises, as an implicit regularizer. Our theoretical analysisfinds such implicit regularizer would favor some convergence points that could stabilize model outputs against perturbation of parameters. To validate our analy-sis, we use our theorems to estimate the closed-form solution of the implicit reg-ularizer over continuous-time SGD dynamics for Ordinary Least-Square (OLS), where the numerical simulation backups our estimates. We further extend our proposals to interpret the newly-fashioned noisy self-distillation tricks for deep learning, where the implicit regularizer demonstrates a unique capacity of selecting models with improved output stability through learning from well-trained teach-ers with additive unbiased random label noises", + "title": "Implicit Regularization Effects of Unbiased Random Label Noises with SGD", + "authors": [ + "Haoyi Xiong", + "Xuhong Li", + "Boyang Yu", + "Dejing Dou", + "Dongrui Wu", + "Zhanxing Zhu" + ], + "emails": [ + "~Dongrui_Wu1", + "~Boyang_Yu1", + "~Haoyi_Xiong1", + "~Zhanxing_Zhu1", + "~Dejing_Dou1", + "~Xuhong_Li3" + ], + "rank": 2952 + }, + { + "url": "https://openreview.net/forum?id=Qe_de8HpWK", + "decision": "Reject", + "ratings": [ + 4, + 2, + 3, + 3 + ], + "rating": "2.94", + "confidences": [ + 4, + 5, + 4, + 5 + ], + "abstract": "Deep neural network-powered artificial intelligence has rapidly changed our daily life with various applications. However, as one of the essential steps of deep neural networks, training a heavily-weighted network requires a tremendous amount of computing resources. 
Especially in the post Moore's Law era, the limit of semiconductor fabrication technology has restricted the development of learning algorithms to cope with the increasing high-intensity training data. Meanwhile, quantum computing has exhibited its significant potential in terms of speeding up the traditionally compute-intensive workloads. For example, Google illustrates quantum supremacy by completing a sampling calculation task in 200 seconds, which is otherwise impracticable on the world's largest supercomputers. To this end, quantum-based learning becomes an area of interest, with the promising of a quantum speedup. In this paper, we propose GenQu, a hybrid and general-purpose quantum framework for learning classical data through quantum states. We evaluate GenQu with real datasets and conduct experiments on both simulations and real quantum computer IBM-Q. Our evaluation demonstrates that, comparing with classical solutions, the proposed models running on GenQu framework achieve similar accuracy with a much smaller number of qubits, while significantly reducing the parameter size by up to 95.8\\% and converging speedup by 66.67% faster. ", + "title": "GenQu: A Hybrid System for Learning Classical Data in Quantum States", + "authors": [ + "Samuel A. 
Stein", + "Ray Marie Tischio", + "Betis Baheri", + "Yiwen Chen", + "Ying Mao", + "Qiang Guan", + "Ang Li", + "Bo Fang" + ], + "emails": [ + "~Ying_Mao1", + "fordham.edu", + "pnnl.gov", + "kent.edu" + ], + "rank": 2953 + }, + { + "url": "https://openreview.net/forum?id=PI_CwQparl_", + "decision": "Reject", + "ratings": [ + 3, + 4, + 3, + 2 + ], + "rating": "2.94", + "confidences": [ + 4, + 4, + 4, + 5 + ], + "abstract": "In this conceptual work, we present DCGMM, a deep hierarchical Gaussian Mixture Model (GMM) that is particularly suited for describing and generating images.\nVanilla (i.e., \"flat\") GMMs require a very large number of components to well describe images, leading to long training times and memory issues. \nDCGMMs avoid this by a stacked architecture of multiple GMM layers, linked by convolution and pooling operations. \nThis allows to exploit the compositionality of images in a similar way as deep CNNs do.\nThis sets them apart from vanilla GMMs which are trained by EM, requiring a prior k-means initialization which is infeasible in a layered structure.\nFor generating sharp images with DCGMM, we introduce a new gradient-based technique for sampling through non-invertible operations like convolution and pooling.\nBased on the MNIST and FashionMNIST datasets, we validate the DCGMM model by demonstrating its superiority over \"flat\" GMMs for clustering, sampling and outlier detection.\nWe additionally demonstrate the applicability of DCGMM to variant generation, in-painting and class-conditional sampling. 
", + "title": "Image Modeling with Deep Convolutional Gaussian Mixture Models", + "authors": [ + "Alexander Gepperth", + "Benedikt Pf\u00fclb" + ], + "emails": [ + "~Benedikt_Pf\u00fclb1", + "~Alexander_Gepperth1" + ], + "rank": 2954 + }, + { + "url": "https://openreview.net/forum?id=_qJXkf347k", + "decision": "Reject", + "ratings": [ + 3, + 2, + 4, + 3 + ], + "rating": "2.94", + "confidences": [ + 3, + 5, + 4, + 4 + ], + "abstract": "Latency of DNN (Deep Neural Network) based prediction is the summation of model loading latency and inference latency. Model loading latency affects the first response from the applications, whereas inference latency affects the subsequent responses. As model loading latency is directly proportional to the model size, this work aims at improving the response time of an intelligent app by reducing the loading latency. The speedup is gained by asymmetrically modularizing the given DNN model among several small child models and loading them in parallel. The decision about number of feasible child models and their corresponding split positions are taken care by reinforcement learning unit (RLU). RLU takes into account the available hardware resources on-device and provides the best splitting index k and their positions p\u2192 specific to the DNN model and device, where p\u2192=(p1,p2,...,pk) and pi is the end position of ith child: Mi. The proposed method has shown significant loading improvement (up to 7X) on popular DNNs, used for camera use-case. The proposed method can be used to speed up the app response. Along with that RLU driven approach facilitates for On-device personalization by separating one module only with trainable layers and loading that particular module while training on-device. 
", + "title": "Reinforcement Learning Based Asymmetrical DNN Modularization for Optimal Loading", + "authors": [ + "Brijraj Singh", + "Yash Jain", + "Mayukh Das", + "Praveen Doreswamy Naidu" + ], + "emails": [ + "~Mayukh_Das1", + "samsung.com", + "~Brijraj_Singh1" + ], + "rank": 2955 + }, + { + "url": "https://openreview.net/forum?id=d_Ue2glvcY8", + "decision": "Reject", + "ratings": [ + 5, + 2, + 2, + 3 + ], + "rating": "2.89", + "confidences": [ + 4, + 5, + 5, + 4 + ], + "abstract": "Controlling the presented forms (or structures) of generated text are as important as controlling the generated contents during neural text generation. It helps to reduce the uncertainty and improve the interpretability of generated text. However, the structures and contents are entangled together and realized simultaneously during text generation, which is challenging for the structure controlling. In this paper, we propose an efficient, straightforward generation framework to control the structure of generated text. A structure-aware transformer (SAT) is proposed to explicitly incorporate multiple types of multi-granularity structure information to guide the text generation with corresponding structure. The structure information is extracted from given sequence template by auxiliary model, and the type of structure for the given template can be learned, represented and imitated. Extensive experiments have been conducted on both Chinese lyrics corpus and English Penn Treebank dataset. 
Both automatic evaluation metrics and human judgement demonstrate the superior capability of our model in controlling the structure of generated text, and the quality ( like Fluency and Meaningfulness) of the generated text is even better than the state-of-the-arts model.\n", + "title": "Structure Controllable Text Generation", + "authors": [ + "Liming DENG", + "Long WANG", + "Binzhu WANG", + "Jiang Qian", + "Bojin Zhuang", + "Shaojun Wang", + "Jing Xiao" + ], + "emails": [ + "pingan.com.cn", + "~Shaojun_Wang1", + "~Jiang_Qian1", + "~Jing_Xiao3", + "~Liming_DENG1", + "gmail.com", + "~Bojin_Zhuang2" + ], + "rank": 2956 + }, + { + "url": "https://openreview.net/forum?id=a7E9TUTXWAT", + "ratings": [ + 3, + 2, + 3, + 4, + 3 + ], + "rating": "2.89", + "confidences": [ + 3, + 5, + 3, + 3, + 4 + ], + "abstract": "Estimating future events is a difficult task. Unlike humans, machine learning ap- proaches are not regularized by a natural understanding of physics. In the wild, a plausible succession of events is governed by the rules of causality, which can- not easily be derived from a finite training set. In this paper we propose a novel theoretical framework to perform causal future prediction by embedding spatio- temporal information on a Minkowski space-time. We utilize the concept of a light cone from special relativity to restrict and traverse the latent space of an arbitrary model. We demonstrate successful applications in causal image synthe- sis and future video frame prediction on a dataset of images. 
Our framework is architecture- and task-independent and comes with strong theoretical guarantees of causal capabilities.", + "title": "Causal Future Prediction in a Minkowski Space-Time", + "authors": [ + "Athanasios Vlontzos", + "Henrique Bergallo Rocha", + "Daniel Rueckert", + "Bernhard Kainz" + ], + "emails": [ + "~Athanasios_Vlontzos1", + "ed.ac.uk", + "~Daniel_Rueckert2", + "~Bernhard_Kainz1" + ], + "rank": 2957 + }, + { + "url": "https://openreview.net/forum?id=AZWHo-jkA_Q", + "ratings": [ + 2, + 2, + 4, + 4 + ], + "rating": "2.88", + "confidences": [ + 4, + 5, + 4, + 3 + ], + "abstract": "We propose to train a multilayer perceptron simultaneously as an encoder and a decoder in order to create a high quality generative model. In one call a network is optimized as either an encoder or decoder, and in a second recursive call the network uses its own outputs to learn the remaining corresponding function, allowing for the minimization of popular statistical divergence measures over a single feed-forward function. This new approach derives from a simple reformulation of variational bayes and extends naturally to the domain of Generative Adversarial Nets. Here we demonstrate a single network which learns a generative model via an adversarial minimax game played against itself. Experiments demonstrate comparable efficacy for the single-network approach versus corresponding multi-network formulations.", + "title": "Generative modeling with one recursive network", + "authors": [ + "Benjamin Lincoln Brimacombe" + ], + "emails": [ + "~Benjamin_Lincoln_Brimacombe1" + ], + "rank": 2958 + }, + { + "url": "https://openreview.net/forum?id=FN7_BUOG78e", + "decision": "Reject", + "ratings": [ + 3, + 4, + 3, + 2 + ], + "rating": "2.88", + "confidences": [ + 4, + 3, + 4, + 5 + ], + "abstract": "To apply an algorithm in a sensitive domain it is important to understand the set of input values that result in specific decisions. 
Deep neural networks suffer from an inherent instability that makes this difficult: different outputs can arise from very similar inputs. \n\nWe present a method to check that the decisions of a deep neural network are as intended by constructing the exact, analytical preimage of its predictions. Preimages generalize verification in the sense that they can be used to verify a wide class of properties, and answer much richer questions besides. We examine the functioning and failures of neural networks used in robotics, including an aircraft collision avoidance system, related to sequential decision making and extrapolation.\n\nOur method iterates backwards through the layers of piecewise linear deep neural networks. Uniquely, we compute \\emph{all} intermediate values that correspond to a prediction, propagating this calculation through layers using analytical formulae for layer preimages. \n\n", + "title": "Computing Preimages of Deep Neural Networks with Applications to Safety", + "authors": [ + "Kyle Matoba", + "Fran\u00e7ois Fleuret" + ], + "emails": [ + "~Kyle_Matoba1", + "~Fran\u00e7ois_Fleuret2" + ], + "rank": 2959 + }, + { + "url": "https://openreview.net/forum?id=GHCu1utcBvX", + "decision": "Reject", + "ratings": [ + 2, + 3, + 4, + 3 + ], + "rating": "2.88", + "confidences": [ + 5, + 3, + 3, + 5 + ], + "abstract": "Compositional generalization is the algebraic capacity to understand and produce large amount of novel combinations from known components. It is a key element of human intelligence for out-of-distribution generalization. To equip neural networks with such ability, many algorithms have been proposed to extract compositional representations from the training distribution. However, it has not been discussed whether the trained model can still extract such representations in the test distribution. 
In this paper, we argue that the extraction ability does not transfer naturally, because the extraction network suffers from the divergence of distributions. To address this problem, we propose to use an auxiliary reconstruction network with regularized hidden representations as input, and optimize the representations during inference. The proposed approach significantly improves accuracy, showing more than a 20% absolute increase in various experiments compared with baselines. To our best knowledge, this is the first work to focus on the transferability of compositionality, and it is orthogonal to existing efforts of learning compositional representations in training distribution. We hope this work will help to advance compositional generalization and artificial intelligence research.", + "title": "Transferability of Compositionality", + "authors": [ + "Yuanpeng Li", + "Liang Zhao", + "Joel Hestness", + "Ka Yee Lun", + "Kenneth Church", + "Mohamed Elhoseiny" + ], + "emails": [ + "~Joel_Hestness2", + "gmail.com", + "~Yuanpeng_Li2", + "~Kenneth_Church1", + "~Liang_Zhao2", + "~Mohamed_Elhoseiny1" + ], + "rank": 2960 + }, + { + "url": "https://openreview.net/forum?id=24-DxeAe2af", + "decision": "Reject", + "ratings": [ + 5, + 2, + 2, + 3 + ], + "rating": "2.81", + "confidences": [ + 3, + 5, + 4, + 4 + ], + "abstract": "A copy number variant (CNV) is a type of genetic mutation where a stretch of DNA is lost or duplicated once or multiple times. CNVs play important roles in the development of diseases and complex traits. CNV detection with short-read DNA sequencing technology is challenging because CNVs significantly vary in size and are similar to DNA sequencing artifacts. Many methods have been developed but still yield unsatisfactory results with high computational costs. Here, we propose CNV-Net, a novel approach for CNV detection using a six-layer convolutional neural network. 
We encode DNA sequencing information into RGB images and train the convolutional neural network with these images. The fitted convolutional neural network can then be used to predict CNVs from DNA sequencing data. We benchmark CNV-Net with two high-quality whole-genome sequencing datasets available from the Genome in a Bottle Consortium, considered as gold standard benchmarking datasets for CNV detection. We demonstrate that CNV-Net is more accurate and efficient in CNV detection than current tools.", + "title": "Accurate and fast detection of copy number variations from short-read whole-genome sequencing with deep convolutional neural network", + "authors": [ + "Jiajin Li", + "Stephen Hwang", + "Luke Zhang", + "Jae Hoon Sul" + ], + "emails": [ + "~Jiajin_Li3", + "gmail.com", + "ucsc.edu", + "~Jae_Hoon_Sul1" + ], + "rank": 2961 + }, + { + "url": "https://openreview.net/forum?id=aia4HejvBmY", + "ratings": [ + 3, + 3, + 3, + 3, + 2 + ], + "rating": "2.78", + "confidences": [ + 4, + 5, + 4, + 5, + 5 + ], + "abstract": "Wildfire has become an unavoidable natural disaster that continues to threaten fire-prone communities and the frequency is expected to increase due to climate change. Therefore, predicting a wildfire spread profile is an essential tool for firefighters when planning an evacuation strategy. The current traditional, physics, and empirically based fire spread models require extensive inputs, which are often difficult to obtain. Thus, we propose a 3D Convolutional Neural Network (CNN), named WildfireNet, that can predict the profile of wildfire of the next day when given historical wildfire profiles and accessible remote-sensing data. WildfireNet utilizes 3-dimensional spaces to extract features from both the temporal and spatial dimensions to better understand the relationship between historical fires and upcoming fires. The motivation behind WildfireNet is to locate fires in a precise manner and be able to accurately predict fire profiles. 
Pixels that were labeled as fire but not on the previous days were extracted to calculate Intersection over Union (IoU) and recall. WildfireNet outperformed 2D CNN and logistic regression model in both IoU and recall.\n", + "title": "A 3D Convolutional Neural Network for Predicting Wildfire Profiles", + "authors": [ + "Samuel Sung", + "Yuping Li", + "Leonard Ortolano" + ], + "emails": [ + "stanford.edu", + "~Samuel_Sung1" + ], + "rank": 2962 + }, + { + "url": "https://openreview.net/forum?id=kNQSWUrUGI_", + "ratings": [ + 3, + 3, + 3, + 2 + ], + "rating": "2.75", + "confidences": [ + 4, + 3, + 5, + 4 + ], + "abstract": "Non-i.i.d data distribution and Differential privacy(DP) protections are two open problems in Federated Learning(FL). We address these two problems by proposing the first noise intrinsic FL training algorithms. In our proposed algorithm, we incorporate a stochastic gradient Langevin dynamices(SGLD) oracle in local node's parameter update phase. Our introduced SGLD oracle would lower generalization errors in local node's parameter learning and provide local node DP protections. We theoretically analyze our algorithm by formulating a min-max objective functions and connects its upper bound with global loss function in FL. 
The convergence of our algorithm on non-convex function is also given as contraction and coupling rate of two random process defined by stochastic differential equations(SDE) We would provide DP analysis for our proposed training algorithm and provide more experiment results soon.", + "title": "A Stochastic Gradient Langevin Dynamics Algorithm For Noise Intrinsic Federated Learning", + "authors": [ + "Yan Shen", + "Jian Du", + "Chunwei Ma", + "Mingchen Gao", + "Benyu Zhang" + ], + "emails": [ + "~Benyu_Zhang1", + "~Yan_Shen1", + "~Chunwei_Ma1", + "~Jian_Du3", + "~Mingchen_Gao1" + ], + "rank": 2963 + }, + { + "url": "https://openreview.net/forum?id=ijVgDcvLmZ", + "decision": "Reject", + "ratings": [ + 3, + 2, + 4, + 2, + 3 + ], + "rating": "2.71", + "confidences": [ + 5, + 4, + 3, + 5, + 4 + ], + "abstract": "We explore energy-based solutions for cooperative multi-agent reinforcement learning (MARL) using the idea of function factorization in centralized training with decentralized execution (CTDE). Existing CTDE based factorization methods are susceptible to the relative overgeneralization, where finding a suboptimal Nash Equilibrium, which is a well-known game-theoretic pathology. To resolve this issue, we propose a novel factorization method for cooperative MARL, named FSV, which learns to factorize the joint soft value function into individual ones for decentralized execution. Theoretical analysis shows that FSV solves a rich class of factorization tasks. Our experiment for the well-known task of the Max of Two Quadratics game shows that FSV fully converges to global optima in the joint action space in the continuous tasks by local searching in the joint action space. 
We evaluate FSV on a challenging set of StarCraft II micromanagement tasks, and show that FSV significantly outperforms existing factorization multi-agent reinforcement learning methods.", + "title": "FSV: Learning to Factorize Soft Value Function for Cooperative Multi-Agent Reinforcement Learning", + "authors": [ + "Yueheng Li", + "Tianhao Zhang", + "Chen Wang", + "Jinan Sun", + "Shikun Zhang", + "Guangming Xie" + ], + "emails": [ + "~Tianhao_Zhang5", + "pku.edu.cn", + "~Chen_Wang11", + "~Shikun_Zhang2", + "~Jinan_Sun1", + "~Guangming_Xie1" + ], + "rank": 2964 + }, + { + "url": "https://openreview.net/forum?id=eIPsmKwTrIe", + "decision": "Reject", + "ratings": [ + 2, + 4, + 2 + ], + "rating": "2.67", + "confidences": [ + 5, + 4, + 3 + ], + "abstract": "We present STEP, a novel Deep Reinforcement Learning solution to the problem of learning instructional sequencing. STEP has three components: 1. Simulate the student by fitting a knowledge tracing model to data logged by an intelligent tutoring system. 2. Train instructional sequencing policies by using Proximal Policy Optimization. 3. Evaluate the learned instructional policies by estimating their local and global impact on learning gains. STEP leverages the student model by representing the student\u2019s knowledge state as a probability vector of knowing each skill and using the student\u2019s estimated learning gains as its reward function to evaluate candidate policies. A learned policy represents a mapping from each state to an action that maximizes the reward, i.e. the upward distance to the next state in the multi-dimensional space. 
We use STEP to discover and evaluate potential improvements to a literacy and numeracy tutor used by hundreds of children in Tanzania.", + "title": "Using Deep Reinforcement Learning to Train and Evaluate Instructional Sequencing Policies for an Intelligent Tutoring System", + "authors": [ + "Jithendaraa Subramanian", + "David Mostow" + ], + "emails": [ + "~David_Mostow1", + "~Jithendaraa_Subramanian1" + ], + "rank": 2965 + }, + { + "url": "https://openreview.net/forum?id=l3gNU1KStIC", + "decision": "Reject", + "ratings": [ + 3, + 3, + 4, + 2, + 2 + ], + "rating": "2.58", + "confidences": [ + 4, + 3, + 2, + 5, + 5 + ], + "abstract": "The goal of the inverse reinforcement learning (IRL) problem is to recover the reward functions from expert demonstrations. However, the IRL problem like any ill-posed inverse problem suffers the congenital defect that the policy may be optimal for many reward functions, and expert demonstrations may be optimal for many policies. In this work, we generalize the IRL problem to a well-posed expectation optimization problem stochastic inverse reinforcement learning (SIRL) to recover the probability distribution over reward functions. We adopt the Monte Carlo expectation-maximization (MCEM) method to estimate the parameter of the probability distribution as the first solution to the SIRL problem. The solution is succinct, robust, and transferable for a learning task and can generate alternative solutions to the IRL problem. Through our formulation, it is possible to observe the intrinsic property for the IRL problem from a global viewpoint, and our approach achieves a considerable performance on the objectworld. 
", + "title": "Stochastic Inverse Reinforcement Learning ", + "authors": [ + "Ce Ju" + ], + "emails": [ + "~Ce_Ju1" + ], + "rank": 2966 + }, + { + "url": "https://openreview.net/forum?id=9GUTgHZgKCH", + "decision": "Reject", + "ratings": [ + 2, + 3, + 4, + 2, + 2 + ], + "rating": "2.55", + "confidences": [ + 5, + 4, + 4, + 4, + 5 + ], + "abstract": "We introduce a new Reduction Algorithm which makes use of the properties of ReLU neurons to reduce significantly the number of neurons in a trained Deep Neural Network. This algorithm is based on the recent theory of implicit and explicit regularization in Deep ReLU Networks from (Maennel et al, 2018) and the authors.\n\nWe discuss two experiments which illustrate the efficiency of the algorithm to reduce the number of neurons significantly with provably almost no change of the learned function within the training data (and therefore almost no loss in accuracy).", + "title": "Reducing the number of neurons of Deep ReLU Networks based on the current theory of Regularization", + "authors": [ + "Jakob Heiss", + "Alexis Stockinger", + "Josef Teichmann" + ], + "emails": [ + "~Alexis_Stockinger1", + "~Jakob_Heiss1", + "math.ethz.ch" + ], + "rank": 2967 + }, + { + "url": "https://openreview.net/forum?id=YTyHkF4P03w", + "decision": "Reject", + "ratings": [ + 2, + 1, + 4, + 3 + ], + "rating": "2.50", + "confidences": [ + 5, + 4, + 4, + 5 + ], + "abstract": "Post-training dropout based approaches achieve high sparsity and are well established means of deciphering problems relating to computational cost and overfitting in Neural Network architectures. Contrastingly, pruning at initialization is still far behind. Initialization pruning is more efficacious when it comes to scaling computation cost of the network. Furthermore, it handles overfitting just as well as post training dropout. It is also averse to retraining losses.\n\nIn approbation of the above reasons, the paper presents two approaches to prune at initialization. 
The goal is to achieve higher sparsity while preserving performance. 1) K-starts, begins with k random p-sparse matrices at initialization. In the first couple of epochs the network then determines the \"fittest\" of these p-sparse matrices in an attempt to find the \"lottery ticket\" p-sparse network. The approach is adopted from how evolutionary algorithms find the best individual. Depending on the Neural Network architecture, fitness criteria can be based on magnitude of network weights, magnitude of gradient accumulation over an epoch or a combination of both. 2) Dissipating gradients approach, aims at eliminating weights that remain within a fraction of their initial value during the first couple of epochs. Removing weights in this manner despite their magnitude best preserves performance of the network. Contrarily, the approach also takes the most epochs to achieve higher sparsity. 3) Combination of dissipating gradients and kstarts outperforms either methods and random dropout consistently.\n\nThe benefits of using the provided pertaining approaches are: 1) They do not require specific knowledge of the classification task, fixing of dropout threshold or regularization parameters 2) Retraining of the model is neither necessary nor affects the performance of the p-sparse network.\n\nWe evaluate the efficacy of the said methods on Autoencoders and Fully Connected Multilayered Perceptrons. The datasets used are MNIST and Fashion MNIST.", + "title": "What to Prune and What Not to Prune at Initialization", + "authors": [ + "Maham Haroon" + ], + "emails": [ + "~Maham_Haroon1" + ], + "rank": 2968 + }, + { + "url": "https://openreview.net/forum?id=jyDpkM9lntb", + "decision": "Reject", + "ratings": [ + 2, + 3, + 2, + 3 + ], + "rating": "2.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "We present a new method for searching optimal hyperparameters among several tasks and several criteria. 
Multi-Task Multi Criteria method (MTMC) provides several Pareto-optimal solutions, among which one solution is selected with given criteria significance coefficients. The article begins with a mathematical formulation of the problem of choosing optimal hyperparameters. Then, the steps of the MTMC method that solves this problem are described. The proposed method is evaluated on the image classification problem using a convolutional neural network. The article presents optimal hyperparameters for various criteria significance coefficients. ", + "title": "Multi-Task Multicriteria Hyperparameter Optimization", + "authors": [ + "Kirill Akhmetzyanov", + "Alexander Yuzhakov" + ], + "emails": [ + "at.pstu.ru", + "~Kirill_Akhmetzyanov1" + ], + "rank": 2969 + }, + { + "url": "https://openreview.net/forum?id=2kImxCmYBic", + "decision": "Reject", + "ratings": [ + 2, + 3, + 3, + 2 + ], + "rating": "2.50", + "confidences": [ + 4, + 4, + 4, + 4 + ], + "abstract": "Mainstream practice in machine learning with tabular data may take for granted that any feature engineering beyond scaling for numeric sets is superfluous in context of deep neural networks. This paper will offer arguments for potential benefits of extended encodings of numeric streams in deep learning by way of a survey of options for numeric transformations as available in the Automunge open source python library platform for tabular data pipelines, where transformations may be applied to distinct columns in \u201cfamily tree\u201d sets with generations and branches of derivations. Automunge transformation options include normalization, binning, noise injection, derivatives, and more. The aggregation of these methods into family tree sets of transformations are demonstrated for use to present numeric features to machine learning in multiple configurations of varying information content, as may be applied to encode numeric sets of unknown interpretation. 
Experiments demonstrate the realization of a novel generalized solution to data augmentation by noise injection for tabular learning, as may materially benefit model performance in applications with underserved training data.", + "title": "Numeric Encoding Options with Automunge", + "authors": [ + "Nicholas Teague" + ], + "emails": [ + "~Nicholas_Teague1" + ], + "rank": 2970 + }, + { + "url": "https://openreview.net/forum?id=K_ETaDx3Iv", + "decision": "Reject", + "ratings": [ + 3, + 2, + 3, + 2 + ], + "rating": "2.50", + "confidences": [ + 5, + 5, + 5, + 5 + ], + "abstract": "The technology for automatic music generation has been very actively studied in recent years. However, almost in these studies, handling domain knowledge of music was omitted or considered a difficult task. In particular, research that analyzes and utilizes the characteristics of each bar of music is very rare, even though it is essential in the human composition. We propose a model that generate music with musical characteristics of bars by conditional generative adversarial network, and analyze the good combination of the sequence of which characterized bars for symbolic-domain music generation by Recurrent Neural Network with Long short term memory layer. Also, by analyzing symbolic music data as image-like based on relational pitch approach, it increases the utilization of the data set with arbitrary chord scales and enables the use of generational results extensively. 
The resulting model FLAGNet generates music with the understanding of musical domain knowledge while handling inputs like minimum unit of note, length of music, chart scales, and chord condition.", + "title": "FLAGNet : Feature Label based Automatic Generation Network for symbolic music", + "authors": [ + "SeongHyeon Go" + ], + "emails": [ + "~SeongHyeon_Go2" + ], + "rank": 2971 + }, + { + "url": "https://openreview.net/forum?id=sgNhTKrZjaT", + "decision": "Reject", + "ratings": [ + 1, + 4, + 3, + 2 + ], + "rating": "2.44", + "confidences": [ + 5, + 4, + 5, + 4 + ], + "abstract": "Variational Auto Encoder (VAE) provide an efficient latent space representation of complex data distributions which is learned in an unsupervised fashion.\nUsing such a representation as input to Reinforcement Learning (RL) approaches may reduce learning time, enable domain transfer or improve interpretability of the model.\nHowever, current state-of-the-art approaches that combine VAE with RL fail at learning good performing policies on certain RL domains.\nTypically, the VAE is pre-trained in isolation and may omit the embedding of task-relevant features due to insufficiencies of its loss.\nAs a result, the RL approach can not successfully maximize the reward on these domains.\nTherefore, this paper investigates the issues of joint training approaches and explores incorporation of policy gradients from RL into the VAE's latent space to find a task-specific latent space representation.\nWe show that using pre-trained representations can lead to policies being unable to learn any rewarding behaviour in these environments.\nSubsequently, we introduce two types of models which overcome this deficiency by using policy gradients to learn the representation.\nThereby the models are able to embed features into its representation that are crucial for performance on the RL task but would not have been learned with previous methods.", + "title": "Guiding Representation Learning in Deep Generative 
Models with Policy Gradients", + "authors": [ + "Luca Lach", + "Timo Korthals", + "Malte Schilling", + "Helge Ritter" + ], + "emails": [ + "~Helge_Ritter1", + "~Malte_Schilling1", + "~Timo_Korthals1", + "~Luca_Lach1" + ], + "rank": 2972 + }, + { + "url": "https://openreview.net/forum?id=ry8_g12nVD", + "decision": "Reject", + "ratings": [ + 3, + 2, + 2 + ], + "rating": "2.33", + "confidences": [ + 5, + 5, + 5 + ], + "abstract": "Traditionally contact centers route an issue to an agent based on ticket load or skill of the agent. When a ticket comes into the system, it is either manually analyzed and pushed to an agent or automatically routed to an agent based on some business rules. A Customer Relationship Management (CRM) system often has predefined categories that an issue could belong to. The agents are generally proficient in handling multiple categories, the categories in the CRM system are often related to each other, and a ticket typically contains content across multiple categories. This makes the traditional approach sub-optimal. We propose a Hybrid Recommendation based approach that recommends top N agents for a ticket by jointly modelling on the interactions between the agents and categories as well as on the semantic features of the categories and the agents.", + "title": "SEMANTIC APPROACH TO AGENT ROUTING USING A HYBRID ATTRIBUTE-BASED RECOMMENDER SYSTEM", + "authors": [ + "Anwitha Paruchuri" + ], + "emails": [ + "~Anwitha_Paruchuri1" + ], + "rank": 2973 + }, + { + "url": "https://openreview.net/forum?id=xPw-dr5t1RH", + "decision": "Reject", + "ratings": [ + 2, + 2, + 2, + 3 + ], + "rating": "2.24", + "confidences": [ + 4, + 5, + 4, + 4 + ], + "abstract": "Embedding logical knowledge information into text generation is a challenging NLP task. In this paper, we propose a knowledge enhanced text generation (KETG) framework, which incorporates both the knowledge and associated text corpus to address logicality and diversity in text generation. 
Specifically, we validate our framework on rhetorical text generation from our newly built rhetoric knowledge graph. Experiments show that our framework outperforms baseline models such as Transformer and GPT-2, on rhetorical type control, semantic comprehensibility and diversity.", + "title": "KETG: A Knowledge Enhanced Text Generation Framework", + "authors": [ + "Yan Cui", + "Xi Chen", + "Jiang Qian", + "Bojin Zhuang", + "Shaojun Wang", + "Jing Xiao" + ], + "emails": [ + "~Shaojun_Wang1", + "~Yan_Cui3", + "~Jing_Xiao3", + "163.com", + "~Bojin_Zhuang2", + "~Jiang_Qian1" + ], + "rank": 2974 + }, + { + "url": "https://openreview.net/forum?id=fcZIsyzF6g", + "ratings": [ + 1, + 3, + 2, + 3 + ], + "rating": "2.22", + "confidences": [ + 5, + 4, + 4, + 5 + ], + "abstract": "As the complexity of our neural network models grow, so too do the data and computation requirements for successful training. One proposed solution to this problem is training on a distributed network of computational devices, thus distributing the computational and data storage loads. This strategy has already seen some adoption by the likes of Google and other companies. In this paper we propose a new method of distributed, decentralized learning that allows a network of computation nodes to coordinate their training using asynchronous updates over an unreliable network while only having access to a local dataset. This is achieved by taking inspiration from Distributed Averaging Consensus algorithms to coordinate the various nodes. Sharing the internal model instead of the training data allows the original raw data to remain with the computation node. The asynchronous nature and lack of centralized coordination allows this paradigm to function with limited communication requirements. We demonstrate our method on the MNIST, Fashion MNIST, and CIFAR10 datasets. 
We show that our coordination method allows models to be learned on highly biased datasets, and in the presence of intermittent communication failure.", + "title": "Consensus Driven Learning", + "authors": [ + "Kyle Crandall", + "Dustin Webb" + ], + "emails": [ + "aiinflux.com", + "~Kyle_Crandall1" + ], + "rank": 2975 + }, + { + "url": "https://openreview.net/forum?id=Iuq6u10sCdl", + "ratings": [ + 2, + 3, + 2, + 2 + ], + "rating": "2.17", + "confidences": [ + 5, + 3, + 5, + 5 + ], + "abstract": "Graphs have been ubiquitous in Machine Learning due to their versatile nature in modelling real world situations .Graph embedding is an important precursor to using graphs in Machine Learning , and much of performance of algorithms developed later depends heavily on this. However very little theoretical work exists in this area , resulting in the proliferation of several benchmarks without any mathematical validation , which is detrimental .In this paper we present an analysis of deterministic graph embedding in general , using tools from Functional Analysis and Topology . 
We prove several important results pertaining to graph embedding which may have practical importance .One limitation of our work in it's present form is it's applicable to deterministic embedding approaches only, although we strongly hope to extend it to random graph embedding methods as well in future.We sincerely hope that this work will be beneficial to researchers working in field of graph embedding.", + "title": "GraphEmbeddingviaTopologyandFunctionalAnalysis", + "authors": [ + "Phani raj Chinnalingu" + ], + "emails": [ + "~Phani_raj_Chinnalingu1" + ], + "rank": 2976 + }, + { + "url": "https://openreview.net/forum?id=8Woqvdszj8B", + "ratings": [ + 2, + 1, + 4, + 2 + ], + "rating": "2.14", + "confidences": [ + 3, + 4, + 3, + 4 + ], + "abstract": "It is all but certain that machine learning models based on deep neural networks will soon feature ubiquitously in a wide variety of critical products and services that people rely on. This should be a major cause of concern given that we still lack a rigorous understanding of the failure modes of these systems, and can hardly make guarantees about the conditions under which the models are expected to work. In particular, we would like to understand how these models manage to generalize so well, even when seemingly overparametrized, effectively evading many of the intuitions expected from statistical learning theory. We argue that Distillation (Caruana et al., 2006, Hinton et al., 2014) provides us with a rich playground for understanding what enables generalization in a concrete setting. We carry out a precise high-dimensional analysis of generalization under distillation in a real world setting, eschewing ad hoc assumptions, and instead consider models actually encountered in the wild. 
", + "title": "Illuminating Dark Knowledge via Random Matrix Ensembles", + "authors": [ + "Anthony Ndirango" + ], + "emails": [ + "~Anthony_Ndirango1" + ], + "rank": 2977 + }, + { + "url": "https://openreview.net/forum?id=cL4wkyoxyDJ", + "decision": "Reject", + "ratings": [ + 1, + 2, + 2, + 3 + ], + "rating": "2.00", + "confidences": [ + 5, + 5, + 5, + 5 + ], + "abstract": "Studies show that neural networks are susceptible to adversarial attacks. This exposes a potential threat to neural network-based artificial intelligence systems. We observe that the probability of the correct result outputted by the network increases by applying small perturbations generated for class labels other than the original predicted one to adversarial examples. Based on this observation, we propose a method of counteracting adversarial perturbations to resist adversarial examples. In our method, we randomly select a number of class labels and generate small perturbations for these selected labels. The generated perturbations are added together and then clamped to a specified space. The obtained perturbation is finally added to the adversarial example to counteract the adversarial perturbation contained in the example. The proposed method is applied at inference time and does not require retraining or finetuning the model. We validate the proposed method on CIFAR-10 and CIFAR-100. 
The experimental results demonstrate that our method effectively improves the defense performance of the baseline methods, especially against strong adversarial examples generated using more iterations.", + "title": "Towards Counteracting Adversarial Perturbations to Resist Adversarial Examples", + "authors": [ + "Haimin ZHANG", + "Min Xu" + ], + "emails": [ + "~Min_Xu5", + "~Haimin_ZHANG1" + ], + "rank": 2978 + }, + { + "url": "https://openreview.net/forum?id=DigrnXQNMTe", + "decision": "Reject", + "ratings": [ + 1, + 2, + 3, + 2 + ], + "rating": "1.94", + "confidences": [ + 5, + 5, + 4, + 4 + ], + "abstract": "We propose a generalized probability kernel(GPK) on discrete distributions with finite support. This probability kernel, defined as kernel between distributions instead of samples, generalizes the existing discrepancy statistics such as maximum mean discrepancy(MMD) as well as probability product kernels, and extends to more general cases. For both existing and newly proposed statistics, we estimate them through empirical frequency and illustrate the strategy to analyze the resulting bias and convergence bounds. We further propose power-MMD, a natural extension of MMD in the framework of GPK, illustrating its usage for the task of two-sample test. 
Our work connects the fields of discrete distribution-property estimation and kernel-based hypothesis test, which might shed light on more new possibilities.", + "title": "A generalized probability kernel on discrete distributions and its application in two-sample test", + "authors": [ + "Le Niu" + ], + "emails": [ + "~Le_Niu1" + ], + "rank": 2979 + }, + { + "url": "https://openreview.net/forum?id=vQgFpfuG3L8", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "Despite the growing interest in continual learning (CL) in recent years, continual reinforcement learning (CRL) has still been a challenging task because deep neural networks must infer appropriate action from each never-seen observation from new tasks sustaining old tasks' performance. To solve this problem, some CRL algorithms used both regularization-based methods constraining the weights and replay-based method used in the conventional CL. However, it costs a lot of time to learn because it required a considerable amount of memory in terms of replay-based and have complex regularization terms. In this paper, we propose a simple framework for preserving knowledge FMAL among related sequential tasks, namely Feature Map Attention Loss. Our method leverages general CNNs of a model that can perform all of the sequential tasks well, and the attention mechanism is used to extract essential features to transfer. Also, FMAL uses both the regularization method and the replay-based method, like the existing CRL method. However, the amount of memory required for learning is much smaller, and the term of regularization is relatively simple. We evaluate FMAL with the state of the art algorithms. 
The experiment results show that our method outperforms these baselines with higher rewards.", + "title": "Continual Reinforcement Learning using Feature Map Attention Loss", + "authors": [ + "Ho-Taek Joo", + "KyungJoong Kim" + ], + "emails": [ + "~KyungJoong_Kim1", + "~Ho-Taek_Joo2" + ], + "rank": 2980 + }, + { + "url": "https://openreview.net/forum?id=-UOq2-C27Yc", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "This paper proposes a new mean-field framework for over-parameterized deep neural networks (DNNs), which can be used to analyze neural network training. In this framework, a DNN is represented by probability measures and functions over its features (that is, the function values of the hidden units over the training data) in the continuous limit, instead of the neural network parameters as most existing studies have done. This new representation overcomes the degenerate situation where all the hidden units essentially have only one meaningful hidden unit in each middle layer, leading to a simpler representation of DNNs. Moreover, we construct a non-linear dynamics called neural feature flow, which captures the evolution of an over-parameterized DNN trained by Gradient Descent. We illustrate the framework via the Residual Network (Res-Net) architecture. It is shown that when the neural feature flow process converges, it reaches a global minimal solution under suitable conditions. 
Our analysis leads to the first global convergence proof for over-parameterized neural network training with more than 3 layers in the mean-field regime.", + "title": "Modeling from Features: a Mean-field Frameworkfor Over-parameterized Deep Neural Networks", + "authors": [], + "emails": [], + "rank": 2981 + }, + { + "url": "https://openreview.net/forum?id=fUOpVdxMJA0", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "The framework for Simulation of Human and Artificial Emotion (SHArE) describes the architecture of emotion in terms of parameters transferable between psychology, neuroscience, and artificial intelligence. These parameters can be defined as abstract concepts, as with structural equation modeling or granularized down to the voltage levels of individual neurons. This model enables emotional trajectory design for humans which may lead to novel therapeutic solutions for various mental health concerns. For artificial intelligence, this work provides a compact notation which can be applied to neural networks as a means to observe the emotions and motivations of machines.", + "title": "Simulation of Human and Artificial Emotion (SHArE)", + "authors": [], + "emails": [], + "rank": 2982 + }, + { + "url": "https://openreview.net/forum?id=BgEGeFRGof", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "In our project, we solve the problem of human activity monitoring based on data from sensors attached to the hands of various workers. First of all, the recognition results help to increase labor productivity and optimize production processes at a building site. Also, the analysis of the behavior of workers allows us to track a person's well-being, compliance with safety measures and accident prevention. \nData collected from the fitness tracker, require careful preprocessing. The Gaussian Process model was applied to fill in the gaps in time series and extract outliers, that increase metrics of the models. 
The comparison of several models for activity recognition was performed if form of supervised learning. An anomaly detection approach was applied and provided useful results for activity monitoring during construction work. In addition, the neural network based on the architecture of variational autoencoder allowed us to extract main work regimes.\nThe fitness tracker time series data set was collected, tagged and published for further research.", + "title": "Anomaly detection and regime searching in fitness-tracker data", + "authors": [], + "emails": [], + "rank": 2983 + }, + { + "url": "https://openreview.net/forum?id=Y8hOBLJrJLU", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "Distributed adaptive stochastic gradient methods have been widely used for large scale nonconvex optimization, such as training deep learning models. However, their iteration complexity on finding \u03b5-stationary points has rarely been analyzed in the nonconvex setting. In this work, we present a novel communication-efficient distributed Adam in the parameter-server model for stochastic nonconvex optimization, dubbed {\\em Efficient-Adam}. Specifically, we incorporate a two-way quantization scheme into Efficient-Adam to reduce the communication cost between the workers and the server. Simultaneously, we adopt a two-way error feedback strategy to reduce the biases caused by the two-way quantization on both the server and workers, respectively. In addition, we establish the iteration complexity for the proposed Efficient-Adam with a class of quantization operators and further characterize its communication complexity between the server and workers when an \u03b5-stationary point is achieved. \nFinally, we solve a toy stochastic convex optimization problem and train deep learning models on real-world vision and language tasks. 
Extensive experimental results together with a theoretical guarantee justify the merits of Efficient Adam.\n", + "title": "Efficient-Adam: Communication-Efficient Distributed Adam with Complexity Analysis", + "authors": [ + "Congliang Chen", + "Li Shen", + "Haozhi Huang", + "Wei Liu", + "Zhi-Quan Luo" + ], + "emails": [ + "~Congliang_Chen1", + "~Wei_Liu3", + "~Zhi-Quan_Luo1", + "~Haozhi_Huang2", + "~Li_Shen1" + ], + "rank": 2984 + }, + { + "url": "https://openreview.net/forum?id=PvZqCDCen_E", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "Biological data including gene expression data are generally high-dimensional and require efficient, generalizable, and scalable machine-learning methods to discover their complex nonlinear patterns. The recent advances in machine learning can be attributed to deep neural networks (DNNs), which excel in various tasks in terms of computer vision and natural language processing. However, standard DNNs are not appropriate for high-dimensional datasets generated in biology because they have many parameters, which in turn require many samples. In this paper, we propose a DNN-based, nonlinear feature selection method, called the feature selection network (FsNet), for high-dimensional and small number of sample data. Specifically, FsNet comprises a selection layer that selects features and a reconstruction layer that stabilizes the training. Because a large number of parameters in the selection and reconstruction layers can easily result in overfitting under a limited number of samples, we use two tiny networks to predict the large, virtual weight matrices of the selection and reconstruction layers. 
Experimental results on several real-world, high-dimensional biological datasets demonstrate the efficacy of the proposed method.", + "title": "FsNet: Feature Selection Network on High-dimensional Biological Data ", + "authors": [ + "Dinesh Singh", + "Hector Clemente", + "Mathis Petrovich", + "Eiryo Kawakami", + "Makoto Yamada" + ], + "emails": [ + "~Dinesh_Singh1", + "riken.jp", + "gmail.com", + "~Makoto_Yamada3" + ], + "rank": 2985 + }, + { + "url": "https://openreview.net/forum?id=Ofjd8AkVExe", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "Anomaly detection is an important task in many traffic applications. Methods based on convolutional neural networks reach state-of-the-art accuracy; however, they typically rely on supervised training with large labeled data and the trained network is only applicable to the intersection that the training data are collected from. Considering that anomaly data are generally hard to obtain, we present data transformation methods for converting data obtained from one intersection to other intersections to mitigate the effort of training data collection. We demonstrate our methods on the task of anomalous trajectory detection and leverage an unsupervised method that require only normal trajectories for network training. We proposed a general model and a universal model for our transformation methods. The general model focuses on saving data collection effort; while the universal model aims at training a universal network for being used by other intersections. We evaluated our methods on the dataset with trajectories collected from GTA V virtual world. 
The experimental results show that with significant reduction in data collecting and network training efforts, our methods still can achieve state-of-the-art accuracy for anomalous trajectory detection.", + "title": "Data Transformer for Anomalous Trajectory Detection", + "authors": [], + "emails": [], + "rank": 2986 + }, + { + "url": "https://openreview.net/forum?id=Y9lsZcQ2Fcc", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": " A formal description of the compositionality of neural networks is associated directly with the formal grammar-structure of the objects it seeks to represent. This formal grammar-structure specifies the kind of components that make up an object, and also the configurations they are allowed to be in. In other words, objects can be described as a parse-tree of its components - a structure that can be seen as a candidate for building connection-patterns among neurons in neural networks. We present a formal grammar description of convolutional neural networks and capsule networks that shows how capsule networks can enforce such parse-tree structures, while CNNs do not. Specifically, we show that the entropy of routing coefficients in the dynamic routing algorithm controls this ability. Thus, we introduce the entropy of routing weights as a loss function for better compositionality among capsules. We show by experiments, on data with a compositional structure, that the use of this loss enables capsule networks to better detect changes in compositionality. Our experiments show that as the entropy of the routing weights increases, the ability to detect changes in compositionality reduces. We see that, without routing, capsule networks perform similar to convolutional neural networks in that both these models perform badly at detecting changes in compositionality. Our results indicate that routing is an important part of capsule networks - effectively answering recent work that has questioned its necessity. 
We also, by experiments on SmallNORB, CIFAR-10, and FashionMNIST, show that this loss keeps the accuracy of capsule network models comparable to models that do not use it . ", + "title": "Learning compositional structures for deep learning: why routing-by-agreement is necessary", + "authors": [ + "Sairaam Venkatraman", + "Ankit Anand", + "S Balasubramanian", + "Raghunatha Sarma Rachakonda" + ], + "emails": [ + "~Raghunatha_Sarma_Rachakonda1", + "gmail.com", + "~Sairaam_Venkatraman1", + "~S_Balasubramanian1" + ], + "rank": 2987 + }, + { + "url": "https://openreview.net/forum?id=Enb37i4iMry", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "We propose an unsupervised inference algorithm to find optimal deep feedforward neural network architecture by modeling hierarchical representations of given data set. Our algorithm learns the optimal neural network architecture that represents a system that generates observed data, in a forward manner without backpropagation. We hypothesize that a neural network architecture, which models the hierarchical representations of given data, provides the optimal feedforward neural network architecture with competitive predictive performance in a supervised manner when compared with supervised-based neural network architecture optimization models. To prove the hypothesis, we evaluated the predictive performance with the feedforward neural network architectures with intensive experiments using various well-known benchmark data sets, such as SPECT, FMNIST, CIFAR-10, heart disease, German Credit, and Statlog heart data. Our algorithm requires much smaller search space for feasible computation than conventional supervised-based neural network architecture optimization methods. The proposed unsupervised inference approach can fully enjoy optimizing neural network architecture from small size of labeled data, when a huge unlabeled data is available. 
", + "title": "Unsupervised inference for optimizing deep feedforward neural network architecture", + "authors": [], + "emails": [], + "rank": 2988 + }, + { + "url": "https://openreview.net/forum?id=V14Jn1pBRCB", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "Program synthesis has recently emerged as a promising approach to the image parsing task. Most prior work has followed a two-step pipeline that involves supervised pretraining of a sequence-to-sequence model on synthetic programs, followed by reinforcement learning to fine-tune the model on real reference images. A purely RL-driven approach without supervised pretraining has never proven successful, since useful programs are too sparse in the program space to sample from. In this paper, we present the first purely unsupervised learning algorithm that parses constructive solid geometry (CSG) images into context-free grammar (CFG) without pretraining. The key ingredients of our approach include entropy regularization and sampling without replacement from the CFG syntax tree, both of which encourage exploration of the search space, and a grammar-encoded tree LSTM that enforces valid outputs. We demonstrate that our unsupervised method can generalize better than supervised training on a synthetic 2D CSG dataset. 
Additionally, we find that training in this way naturally optimizes the quality of the top-k programs, leading our model to outperform existing method on a 2D computer aided design (CAD) dataset under beam search.", + "title": "Unsupervised Program Synthesis for Images By Sampling Without Replacement", + "authors": [ + "Chenghui Zhou", + "Chun-Liang Li", + "Barnab\u00e1s P\u00f3czos" + ], + "emails": [ + "~Barnab\u00e1s_P\u00f3czos1", + "~Chenghui_Zhou1", + "~Chun-Liang_Li1" + ], + "rank": 2989 + }, + { + "url": "https://openreview.net/forum?id=CrY1vHr_wHC", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "With the excessive use of neural networks in safety critical domains the need for understandable explanations of their predictions is rising. Several methods were developed which identify the most relevant inputs, such as sensitivity analysis and most prominently layerwise relevance propagation (LRP). \nIt has been shown that the noise in the explanations from the sensitivity analysis can be heavily reduced by averaging over noisy versions of the input image, a method referred to as SmoothGrad. \nWe investigate the application of the same principle to LRP and find that it smooths the resulting relevance function leading to improved explanations for state-of-the-art LRP rules. The method, that we refer to as SmoothLRP, even produces good explanations on poorly trained neural networks, where former methods show unsatisfactory results. Interestingly, we observed, that SmoothLRP can also be applied to the identification of adversarial examples. 
", + "title": "SmoothLRP: Smoothing Explanations of Neural Network Decisions by Averaging over Stochastic Input Variations", + "authors": [ + "Arne Peter Raulf", + "Ben Luis Hack", + "Sina D\u00e4ubener", + "Axel Mosig", + "Asja Fischer" + ], + "emails": [ + "~Sina_D\u00e4ubener1", + "rub.de", + "~Asja_Fischer1", + "~Axel_Mosig1" + ], + "rank": 2990 + }, + { + "url": "https://openreview.net/forum?id=SGd00Q1lRrw", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "Artificial systems currently outperform humans in diverse computational domains, but none has achieved parity in speed and overall versatility of mastering novel tasks. A critical component to human success, in this regard, is the ability to redeploy and redirect data passed between cognitive subsystems (via abstract feature representations) in response to changing task demands. The present work formalizes this coordination procedure in terms of multiaffine functions of stimulus and control states. In experiments, the resulting model robustly predicts behavior and performance of multitasking networks on natural language data (MNIST) using common deep network architectures. Consistent with existing theory in cognitive control, representation structure varies in response to (a) environmental pressures for representation sharing, (b) demands for parallel processing capacity, and (c) tolerance for crosstalk. 
Implications for geometric (dimension, curvature), functional (automaticity, generalizability, modularity), and applied aspects of representation learning are discussed.", + "title": "Multiaffine representations mediate tradeoff between generalization and parallel processing capacity in networks trained to multitask", + "authors": [ + "Gregory Henselman-Petrusek", + "Tyler Giallanza", + "Sebastian Musslick", + "Jonathan Cohen" + ], + "emails": [ + "~Sebastian_Musslick1", + "~Gregory_Henselman-Petrusek1", + "~Jonathan_Cohen1", + "~Tyler_Giallanza1" + ], + "rank": 2991 + }, + { + "url": "https://openreview.net/forum?id=510f7KAPmYR", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "We propose an efficient neural architecture search (NAS) algorithm with a flexible search space that encompasses layer operations down to individual weights. This work addresses NAS challenges in a search space of weight connections within layers, specifically the large number of architecture variations compared to a high-level search space with predetermined layer types. Our algorithm continuously evolves network architecture by adding new candidate parameters (weights and biases) using a first-order estimation based on their gradients at 0. Training is decoupled into alternating steps: adjusting network weights holding architecture constant, and adjusting network architecture holding weights constant. We explore additional applications by extend this method for multi-task learning with shared parameters. On the CIFAR-10 dataset, our evolved network achieves an accuracy of 97.42\\% with 5M parameters, and 93.75\\% with 500K parameters. 
On the ImageNet dataset, we achieve 76.6\\% top-1 and 92.5\\% top-5 accuracy with a search restriction of 8.5M parameters.", + "title": "Intra-layer Neural Architecture Search", + "authors": [ + "Dong Kai Wang", + "Nam Sung Kim" + ], + "emails": [ + "~Dong_Kai_Wang1", + "~Nam_Sung_Kim1" + ], + "rank": 2992 + }, + { + "url": "https://openreview.net/forum?id=vUe3AjTrD1", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "The representation of grid cells in the medial entorhinal cortex (MEC) region is crucial for path integration. In this paper, we proposed a grid cell modeling mechanism by mapping the agent\u2019s self-motion in Euclidean space to the neuronal activity of grid cells. Our representational model can achieve hexagonal patterns of grid cells from recurrent neural network (RNN) and enables multi-scale path integration for 1D, 2D and 3D spaces. Different from the existing works which need to learn weights of RNN to get the vector representation of grid cells, our method can obtain weights by direct matrix operations. Moreover, compared with the classical models based on continuous attractor network (CAN), our model avoids the connection matrix\u2019s symmetry limitation and spatial representation redundancy problems. In this paper, we also discuss the connection pattern between grid cells and place cells to demonstrate grid cells\u2019 functioning as a metric for coding space. ", + "title": "Grid cell modeling with mapping representation of self-motion for path integration", + "authors": [], + "emails": [], + "rank": 2993 + }, + { + "url": "https://openreview.net/forum?id=VDUovuK0gV", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "We propose a learning method that, dynamically modifies the time-constants of the continuous-time counterpart of a vanilla RNN. The time-constants are modified based on the current observation and hidden state. 
Our proposal overcomes the issues of RNN trainability, by mitigating exploding and vanishing gradient phenomena based on placing novel constraints on the parameter space, and by suppressing noise in inputs based on pondering over informative inputs to strengthen their contribution in the hidden state. As a result, our method is computationally efficient overcoming overheads of many existing methods that also attempt to improve RNN training. Our RNNs, despite being simpler and having light memory footprint, shows competitive performance against standard LSTMs and baseline RNN models on many benchmark datasets including those that require long-term memory.\n", + "title": "Time Adaptive Recurrent Neural Network", + "authors": [ + "Anil Kag", + "Venkatesh Saligrama" + ], + "emails": [ + "~Anil_Kag1", + "~Venkatesh_Saligrama1" + ], + "rank": 2994 + }, + { + "url": "https://openreview.net/forum?id=hrpSB_rzQTU", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "Building a good speech recognition system usually requires large amounts of transcribed data, which is expensive to collect. To tackle this problem, many unsupervised pre-training methods have been proposed. Among these methods, Masked Predictive Coding achieved significant improvements on various speech recognition datasets with BERT-like Masked Reconstruction loss and Transformer backbone. However, many aspects of MPC have not been fully investigated. In this paper, we conduct a further study on MPC and focus on three important aspects: the effect of pre-training data speaking style, its extension on streaming model, and how to better transfer learned knowledge from pre-training stage to downstream tasks. Experiments reveled that pre-training data with matching speaking style is more useful on downstream recognition tasks. A unified training objective with APC and MPC provided 8.46% relative error reduction on streaming model trained on HKUST. 
Also, the combination of target data adaption and layer-wise discriminative training helped the knowledge transfer of MPC, which achieved 3.99% relative error reduction on AISHELL over a strong baseline. ", + "title": "A Further Study of Unsupervised Pre-training for Transformer Based Speech Recognition", + "authors": [], + "emails": [], + "rank": 2995 + }, + { + "url": "https://openreview.net/forum?id=cDlp2bO6L5b", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "We study the transfer learning process between two linear regression problems. An important and timely special case is when the regressors are overparameterized and perfectly interpolate their training data. We examine a parameter transfer mechanism whereby a subset of the parameters of the target task solution are constrained to the values learned for a related source task. We analytically characterize the generalization error of the target task in terms of the salient factors in the transfer learning architecture, i.e., the number of examples available, the number of (free) parameters in each of the tasks, the number of parameters transferred from the source to target task, and the correlation between the two tasks. Our non-asymptotic analysis shows that the generalization error of the target task follows a two-dimensional double descent trend (with respect to the number of free parameters in each of the tasks) that is controlled by the transfer learning factors. 
Our analysis points to specific cases where the transfer of parameters is beneficial.", + "title": "Double Double Descent: On Generalization Errors in Transfer Learning between Linear Regression Tasks", + "authors": [ + "Yehuda Dar", + "Richard Baraniuk" + ], + "emails": [ + "~Richard_Baraniuk1", + "~Yehuda_Dar1" + ], + "rank": 2996 + }, + { + "url": "https://openreview.net/forum?id=CBYttzvzAS", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "Policy gradient methods are built on the policy gradient theorem, which involves a term representing the complete sum of rewards into the future: the return. Due to this, one usually either waits until the end of an episode before performing updates, or learns an estimate of this return--a so-called critic. Our emphasis is on the first approach in this work, detailing an incremental policy gradient update which neither waits until the end of the episode, nor relies on learning estimates of the return. We provide on-policy and off-policy variants of our algorithm, for both the discounted return and average reward settings. Theoretically, we draw a connection between the traces our methods use and the stationary distributions of the discounted and average reward settings. We conclude with an experimental evaluation of our methods on both simple-to-understand and complex domains.", + "title": "Incremental Policy Gradients for Online Reinforcement Learning Control", + "authors": [ + "Kristopher De Asis", + "Alan Chan", + "Yi Wan", + "Richard S. Sutton" + ], + "emails": [ + "~Richard_S._Sutton1", + "~Alan_Chan2", + "~Kristopher_De_Asis1", + "~Yi_Wan1" + ], + "rank": 2997 + }, + { + "url": "https://openreview.net/forum?id=60GDNLY-m3M", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "Consistency-based method has been proved effective for semi-supervised learning (SSL). 
However, the impact of the pseudo-labeled samples' quality as well as the mining strategies for high quality training sample have rarely been studied in SSL. An intuitive idea is to select pseudo-labeled training samples by threshold. We find it essential the selection of these thresholds to the final result of SSL. Following this discovery, we propose SEAT (Score Ensemble with Adaptive Threshold), a simple and efficient semi-supervised learning object detection method, in which the high confidence pseudo-labels are selected for self-training. Apart from confidence score as the indicator of the sample's quality, we also introduce the scores of temporal consistency and augmentation consistency. The scores provide a more comprehensive description to the quality of each sample. To cope with the data distribution difference among categories, the adaptive threshold strategy is used to automatically determine the sample mining threshold for each category. We conduct experiments on PASCAL-VOC and MSCOCO, extensive results show that our method is competitive and can be easily combined with consistency-based methods.", + "title": "Rethinking Pseudo-labeled Sample Mining for Semi-Supervised Object Detection", + "authors": [ + "Duo Li", + "Sanli Tang", + "Zhanzhan Cheng", + "Shiliang Pu", + "Yi Niu", + "Wenming Tan", + "Fei Wu", + "Xiaokang Yang" + ], + "emails": [ + "~Yi_Niu1", + "~Xiaokang_Yang1", + "~Sanli_Tang1", + "~Zhanzhan_Cheng1", + "~Shiliang_Pu1", + "hikvision.com", + "~Wenming_Tan1", + "~Fei_Wu2" + ], + "rank": 2998 + }, + { + "url": "https://openreview.net/forum?id=aqjdsJAGnw6", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": " Program synthesis is the task to automatically generate programs based on user specification. In this paper, we present a framework that synthesizes programs from images of flow chart that serve as accurate and intuitive specifications. 
In order doing so, we propose a deep neural network called FCR that recognizes graph structure from its image. FCR is trained end-to-end, which can predict edges and nodes of the flow chart simultaneously. Experiments show that the accuracy to synthesize a program is 66.4%, and the accuracy to recognize edges and nodes are 94.1% and 67.9%, respectively. On average, it takes about 60 milliseconds to synthesize a program. \n", + "title": "FCR: Flow Chart Recognition Network for Program Synthesis", + "authors": [], + "emails": [], + "rank": 2999 + }, + { + "url": "https://openreview.net/forum?id=BUCHknhWq8D", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "Sparse regression problems have traditionally been solved using all available measurements simultaneously. However, this approach fails in challenging scenarios such as when the noise level is high or there are missing data / adversarial samples. \nWe propose JOBS (Joint-Sparse Optimization via Bootstrap Samples) -- a \\emph{collaborative} sparse-regression framework on bootstrapped samples from the pool of available measurements via a joint-sparse constraint to ensure support consistency. In comparison to traditional bagging which solves sub-problems in an \\emph{independent} fashion across bootstrapped samples, JOBS achieves state-of-the-art performance with the added advantage of having a sparser solution while requiring a lower number of observation samples.\n\nAnalysis of theoretical performance limits is employed to determine critical optimal parameters: the number of bootstrap samples K and the number of elements L in each bootstrap sample. Theoretical results indicate a better bound than Bagging (i.e. higher probability of achieving the same or better performance). Simulation results are used to validate this parameter selection. 
JOBS is robust to adversarial samples that fool the baseline method, as shown by better generalization in an image reconstruction task where the adversary has similar occlusions or alignment as the test sample. Furthermore, JOBS also improves discriminative performance in a facial recognition task in a sparse-representation-based classification setting.", + "title": "Sparse Recovery via Bootstrapping: Collaborative or Independent?", + "authors": [], + "emails": [], + "rank": 3000 + }, + { + "url": "https://openreview.net/forum?id=uIipjoThjAH", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "Spiking Neural Networks (SNNs) have recently emerged as an alternative to deep learning owing to sparse, asynchronous and binary event (or spike) driven processing, that can yield huge energy efficiency benefits on neuromorphic hardware. Most existing approaches to create SNNs either convert the weights from pre-trained Artificial Neural Networks (ANNs) or directly train SNNs with surrogate gradient backpropagation. Each approach presents its pros and cons. The ANN-to-SNN conversion method requires at least hundreds of time-steps for inference to yield competitive accuracy that in turn reduces the energy savings. Training SNNs with surrogate gradients from scratch reduces the latency or total number of time-steps, but the training becomes slow/problematic and has convergence issues. Thus, the latter approach of training SNNs has been limited to shallow networks on simple datasets. To address this training issue in SNNs, we revisit batch normalization and propose a temporal Batch Normalization Through Time (BNTT) technique. Most prior SNN works till now have disregarded batch normalization deeming it ineffective for training temporal SNNs. Different from previous works, our proposed BNTT decouples the parameters in a BNTT layer along the time axis to capture the temporal dynamics of spikes. 
The temporally evolving learnable parameters in BNTT allow a neuron to control its spike rate through different time-steps, enabling low-latency and low-energy training from scratch. We conduct experiments on CIFAR-10, CIFAR-100, Tiny-ImageNet and event-driven DVS-CIFAR10 datasets. BNTT allows us to train deep SNN architectures from scratch, for the first time, on complex datasets with just few 25-30 time-steps. We also propose an early exit algorithm using the distribution of parameters in BNTT to reduce the latency at inference, that further improves the energy-efficiency. Code will be made available.", + "title": "Revisiting Batch Normalization for Training Low-latency Deep Spiking Neural Networks from Scratch", + "authors": [], + "emails": [], + "rank": 3001 + }, + { + "url": "https://openreview.net/forum?id=68NusZqF0da", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "Mathematical reasoning task is a subset of the natural language question answering task. Several approaches have been proposed in existing work to solve mathematical reasoning problems. Among them, the two-phase solution to first predict formulas from questions and then calculate answers from formulas has achieved desirable performance. However, this design results in the reliance on annotated formulas as the intermediate labels for training. In this work, we put forward a brand-new idea to enable the models to explore the formulas by themselves to eliminate the reliance on formula annotations. To realize this, we proposed Weakly Supervised Formula Leaner, a learning framework that can autonomously search for the optimal formulas through the training process and continuously update itself. Our experiment is conducted on a typical mathematical dataset MathQA. 
The result shows that our models learning with weak supervision outperform the baseline methods.", + "title": "Weakly Supervised Formula Learner for Solving Mathematical Problems", + "authors": [ + "Yuxuan Wu", + "Hideki Nakayama" + ], + "emails": [ + "~Hideki_Nakayama1", + "~Yuxuan_Wu1" + ], + "rank": 3002 + }, + { + "url": "https://openreview.net/forum?id=IT2s2Ub6skl", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "Analysis of the human microbiome using metagenomic sequencing data has demonstrated high ability in discriminating various human diseases. Raw metagenomic sequencing data require multiple complex and computationally heavy bioinformatics steps prior to data analysis. Such data contain millions of short sequences read from the fragmented DNA sequences and are stored as fastq files. Conventional processing pipelines consist multiple steps including quality control, filtering, alignment of sequences against genomic catalogs (genes, species, taxonomic levels, functional pathways, etc.). These pipelines are complex to use, time consuming and rely on a large number of parameters that often provide variability and impact the estimation of the microbiome elements. \nRecent studies have demonstrated that training Deep Neural Networks directly from raw sequencing data is a promising approach to bypass some of the challenges associated with mainstream bioinformatics pipelines. Most of these methods use the concept of word and sentence embeddings that create a meaningful and numerical representation of DNA sequences, while extracting features and reducing the dimentionality of the data. \nIn this paper we present an end-to-end approach that classifies patients into disease groups directly from raw metagenomic reads: metagenome2vec. 
This approach is composed of four steps (i) generating a vocabulary of k-mers and learning their numerical embeddings; (ii) learning DNA sequence (read) embeddings; (iii) identifying the genome from which the sequence is most likely to come and (iv) training a multiple instance learning classifier which predicts the phenotype based on the vector representation of the raw data. An attention mechanism is applied in the network so that the model can be interpreted, assigning a weight to the influence of the prediction for each genome.\nUsing two public real-life data-sets as well a simulated one, we demonstrated that this original approach reached very high performances, comparable with the state-of-the-art methods applied directly on processed data though mainstream bioinformatics workflows. These results are encouraging for this proof of concept work. We believe that with further dedication, the DNN models have the potential to surpass mainstream bioinformatics workflows in disease classification tasks.", + "title": "Towards end-to-end disease prediction from raw metagenomic data", + "authors": [], + "emails": [], + "rank": 3003 + }, + { + "url": "https://openreview.net/forum?id=c_YKhwEdUcq", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "We describe a neural-based method for generating exact or approximate solutions to differential equations in the form of mathematical expressions. Unlike other neural methods, our system returns symbolic expressions that can be interpreted directly. Our method uses a neural architecture for learning mathematical expressions to optimize a customizable objective, and is scalable, compact, and easily adaptable for a variety of tasks and configurations. The system has been shown to effectively find exact or approximate symbolic solutions to various differential equations with applications in natural sciences. 
In this work, we highlight how our method applies to partial differential equations over multiple variables and more complex boundary and initial value conditions. ", + "title": "A neural method for symbolically solving partial differential equations", + "authors": [], + "emails": [], + "rank": 3004 + }, + { + "url": "https://openreview.net/forum?id=LO7tSIUIub", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "We show analytically that training a neural network by stochastic mutation or neuroevolution of its weights is equivalent, in the limit of small mutations, to gradient descent on the loss function in the presence of Gaussian white noise. Averaged over independent realizations of the learning process, neuroevolution is equivalent to gradient descent on the loss function. We use numerical simulation to show that this correspondence can be observed for finite mutations. Our results provide a connection between two distinct types of neural-network training, and provide justification for the empirical success of neuroevolution.", + "title": "Correspondence between neuroevolution and gradient descent", + "authors": [], + "emails": [], + "rank": 3005 + }, + { + "url": "https://openreview.net/forum?id=aL00mtbUoal", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "Contrastive self-supervised learning (CSL) is an approach to learn useful representations by solving a pretext task which selects and compares anchor, negative and positive (APN) features from an unlabeled dataset. We present a conceptual framework which characterizes CSL approaches in five aspects (1) data augmentation pipeline, (2) encoder selection, (3) representation extraction, (4) similarity measure, and (5) loss function. We analyze three leading CSL approaches--AMDIM, CPC and SimCLR--, and show that despite different motivations, they are special cases under this framework. 
We show the utility of our framework by designing Yet Another DIM (YaDIM) which achieves competitive results on CIFAR-10, STL-10 and ImageNet, and is more robust to the choice of encoder and the representation extraction strategy. To support ongoing CSL research, we release the PyTorch implementation of this conceptual framework along with standardized implementations of AMDIM, CPC (V2), SimCLR, BYOL, Moco (V2) and YADIM.", + "title": "A Framework For Contrastive Self-Supervised Learning And Designing A New Approach", + "authors": [ + "William Alejandro Falcon", + "Kyunghyun Cho" + ], + "emails": [ + "~Kyunghyun_Cho1", + "~William_Alejandro_Falcon1" + ], + "rank": 3006 + }, + { + "url": "https://openreview.net/forum?id=vKYP0XhN_dq", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "Classification tasks require balanced distribution of data in order to ensure the learner to be trained to generalize over all classes. In realistic settings, however, the number of instances vary substantially among classes. This typically leads to a learner that promotes bias towards the majority group due to its dominating property. Therefore, methods to handle imbalanced data is crucial for alleviating distributional skews and fully utilizing the under-represented data. We propose a novel training method, Sequential Targeting, that forces an incremental learning setting by splitting the data into mutually exclusive subset and adaptively balancing the data distribution as tasks develop. To address problems that arise within incremental learning, we apply dropout and elastic weight consolidation with our method. 
It is demonstrated in a variety of experiments on both text and image dataset (IMDB, CIFAR-10, MNIST) and has proven its superiority over traditional methods such as oversampling and under-sampling.", + "title": "Learning to Balance with Incremental Learning", + "authors": [ + "Joel Jang", + "Yoonjeon Kim", + "Jaewoo Kang" + ], + "emails": [ + "~Yoonjeon_Kim1", + "~Joel_Jang1", + "~Jaewoo_Kang1" + ], + "rank": 3007 + }, + { + "url": "https://openreview.net/forum?id=zmCmuqWh14L", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "Off-target predictions are crucial in gene editing research to improve existing prediction methods. Recently, significant progress has been achieved in the field of prediction of off-target mutations, particularly with CRISPR-Cas9 data, thanks to the use of deep learning. CRISPR-Cas9 is a precise gene editing technique allowing manipulations of DNA fragments. The encoding of sgRNA-DNA sequences for deep neural networks is a complex process, which impacts significantly the prediction accuracy. In this context, we propose a novel encoding of sgRNA-DNA sequences that is capable to aggregate the involved sequence data without any loss of information. In our experiments, we compare our novel encoding with the state-of-the-art sgRNA-DNA encoding. We demonstrate the superior accuracy of our approach in our simulations involving Feedforward Neural Networks (FFNs) and Convolutional Neural Networks (CNNs). We highlight the universality of our results by building several FFNs and CNNs with various layer depths and performing predictions on two popular public gene editing data sets, the CRISPOR data set and the GUIDE-seq data set. In all our experiments, the new encoding led to more accurate off-target prediction results, providing an improvement of the area under the Receiver Operating Characteristic (ROC) curve up to 35\\%. 
", + "title": "Novel Encoding of sgRNA-DNA Sequences for Accurate Deep Learning Off-Target Predictions", + "authors": [ + "Jeremy Charlier", + "Vladimir Makarenkov" + ], + "emails": [ + "uqam.ca", + "~Jeremy_Charlier2" + ], + "rank": 3008 + }, + { + "url": "https://openreview.net/forum?id=Hy8JM_Fvt5N", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "Binary networks are extremely efficient as multiplications and additions are replaced by bit shifts. Yet, binarizing network models reduces their representational power: a binary weight with a value of -1 or +1 cannot represent as much information as a real weight. We make the observation that pruning weights adds the value 0 as an additional symbol and thus increases the information capacity of the network. This increases the solution space of our network -- more network configurations are possible. Thus far, all hypothesis are considered equally likely. Yet, given that the network is binary, by assuming a Bernoulli prior over the weights, we restricts the hypothesis space to only the ones that can be effectively encoded in a binary network. We show that this view leads to maximizing the information capacity over the binary weights. In this work we propose to jointly prune binary weights and maximize the information capacity, thus finding a subnetwork that performs better than the original network. On 3 datasets and 11 architectures we show compact models with good accuracy comparing favorably to state-of-the-art. 
\n", + "title": "Less bits is more: How pruning deep binary networks increases weight capacity", + "authors": [ + "Yunqiang Li", + "Silvia Laura Pintea", + "Jan van Gemert" + ], + "emails": [ + "~Silvia_Laura_Pintea1", + "~Jan_van_Gemert1", + "~Yunqiang_Li1" + ], + "rank": 3009 + }, + { + "url": "https://openreview.net/forum?id=EEBhskS0Gzt", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "A number of training protocols in machine learning seek to enhance learning efficacy by training a single agent on multiple tasks in sequence. Sequential acquisition exploits the discovery of common structure between tasks in the form of shared representations to improve learning speed and generalization. The learning of shared representations, however, is known to impair the execution of multiple tasks in parallel. The parallel execution of tasks results in higher efficiency of processing and is promoted by separating representations between tasks to avoid processing interference. Here, we build on previous work involving shallow networks and simple task settings suggesting that there is a trade-off between learning efficacy and processing efficiency, mediated by the use of shared versus separated representations. We show that the same tension arises in deep networks and discuss a meta-learning algorithm for an agent to manage this trade-off in an unfamiliar environment. We display through different experiments that the agent is able to successfully optimize its training strategy as a function of the environment.", + "title": "Navigating the Trade-Off between Learning Efficacy and Processing Efficiency in Deep Neural Networks", + "authors": [], + "emails": [], + "rank": 3010 + }, + { + "url": "https://openreview.net/forum?id=uf7FZWXJxYQ", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "Decentralized optimization problems frequently appear in the large scale machine learning problem. 
However, few works work on the nonconvex nonsmooth case. In this paper, we give a primal-dual algorithm to solve the nonconvex nonsmooth optimization problem. In addition, to reduce communication overhead, we introduce compression function. We analyze the convergence results of the algorithm and shows the algorithm meets the lower iteration complexity bound. Besides, we conduct two experiments, both of them shows the efficacy of our algorithm. ", + "title": "Communication Efficient Primal Dual Algorithm for Nonconvex Nonsmooth Distributed Optimization", + "authors": [ + "Congliang Chen", + "Jiawei Zhang", + "Li Shen", + "Peilin Zhao", + "Zhi-Quan Luo" + ], + "emails": [ + "~Jiawei_Zhang6", + "~Zhi-Quan_Luo1", + "~Congliang_Chen1", + "~Li_Shen1", + "~Peilin_Zhao2" + ], + "rank": 3011 + }, + { + "url": "https://openreview.net/forum?id=-FzLjLBTQ1g", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "Current practices to apply temperature scaling assume either a fixed, or a manually-crafted dynamically changing schedule. However, our studies indicate that the individual optimal trajectory for each class can change with the context. To this end, we propose context-aware temperature, a generalized approach to provide an individual optimal temperature trajectory over the context for each vocabulary, while allowing the temperature to be learned along with the remaining model parameters during training. 
Experiment results confirm that the proposed method significantly improves state-of-the-art language models, achieving a perplexity of 19.90 on Penn Treebank, 33.88 on WikiText-2, and 4.7 on WikiText-103.", + "title": "Context-Aware Temperature for Language Modeling ", + "authors": [ + "Pei-Hsin Wang", + "Sheng-Iou Hsieh", + "Shih-Chieh Chang", + "Yu-Ting Chen", + "Da-Cheng Juan", + "Jia-Yu Pan", + "Wei Wei" + ], + "emails": [ + "~Jia-Yu_Pan1", + "~Wei_Wei15", + "~Sheng-Iou_Hsieh1", + "~Shih-Chieh_Chang1", + "~Da-Cheng_Juan1", + "~Pei-Hsin_Wang1", + "~Yu-Ting_Chen2" + ], + "rank": 3012 + }, + { + "url": "https://openreview.net/forum?id=hz_l6dFqfQ", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "The nodes of a graph existing in a specific cluster are more likely to connect to each other than with other nodes in the graph. Then revealing some information about the nodes, the structure of the graph (the graph edges) provides this opportunity to know more information about the other nodes. From this perspective, this paper revisits the node classification task in a semi-supervised scenario by graph convolutional neural network. The goal is to benefit from the flow of information that circulates around the revealed node labels. For this aim, this paper provides a new graph convolutional neural network architecture. This architecture benefits efficiently from the revealed training nodes, the node features, and the graph structure. On the other hand, in many applications, non-graph observations (side information) exist beside a given graph realization. The non-graph observations are usually independent of the graph structure. This paper shows that the proposed architecture is also powerful in combining a graph realization and independent non-graph observations. 
For both cases, the experiments on the synthetic and real-world datasets demonstrate that our proposed architecture achieves a higher prediction accuracy in comparison to the existing state-of-the-art methods for the node classification task.", + "title": "New GCNN-Based Architecture for Semi-Supervised Node Classification", + "authors": [], + "emails": [], + "rank": 3013 + }, + { + "url": "https://openreview.net/forum?id=U9fS7LlB6b", + "ratings": [], + "rating": " N/A", + "confidences": [], + "abstract": "Deep Neural networks are susceptible to adversarial attacks: if inputs are perturbed in a specific manner, it can result in misclassification. However, recent studies have shown that the robustness of the network has certain connection with the complexity of the architecture. In this paper, we investigate the distinctive influence of additional convolutional layers at the decision boundaries of several DNN architectures. To generate adversarial examples, we have utilized two different attack mechanisms, Fast Gradient Method (FGM) and One Step Spectral Attack (OSSA) using common datasets like MNIST, Fashion MNIST and Cifar 10. We have investigated the aftermaths of adding layers to the robustness of the architecture. For reasoning, we have analyzed separation width from linear class partitions and local geometry (curvature) near the decision boundary. The result shows that model complexity has significant roles in changing distances, as well as, the local features of decision boundary, which impacts the robustness of the network.", + "title": "Studying relationship between geometry of decision boundaries with network complexity for robustness analysis", + "authors": [], + "emails": [], + "rank": 3014 + } +] \ No newline at end of file diff --git a/index.html b/index.html index 953c6ba..46678a2 100644 --- a/index.html +++ b/index.html @@ -11,6 +11,7 @@

ICLR2018 Open Review Explorer

ICLR 2018 ICLR 2019 ICLR 2020 + ICLR 2021
diff --git a/js/index.js b/js/index.js index ac5ec47..b2ef3c4 100644 --- a/js/index.js +++ b/js/index.js @@ -17,7 +17,7 @@ let request = new XMLHttpRequest(); let conference = getParameterByName('conf'); if (!conference) { - conference = 'iclr2020'; + conference = 'iclr2021'; } document.querySelector('h1').textContent = `${conference.toUpperCase()} Open Review Explorer` console.log(`${conference}.json`)